This commit is contained in:
Michiel de Hoon
2025-01-14 16:59:17 +09:00
committed by Peter Cock
parent 55add62e03
commit de8b8e2ec5
48 changed files with 5 additions and 16491 deletions

View File

@ -1,270 +0,0 @@
# Copyright 2011 by Andreas Wilm. All rights reserved.
# Based on ClustalW wrapper copyright 2009 by Cymon J. Cox.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the multiple alignment program Clustal Omega."""
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
class ClustalOmegaCommandline(AbstractCommandline):
    """Build command lines for the Clustal Omega multiple sequence aligner.

    http://www.clustal.org/omega

    Notes
    -----
    Last checked against version: 1.2.0

    References
    ----------
    Sievers F, Wilm A, Dineen DG, Gibson TJ, Karplus K, Li W, Lopez R,
    McWilliam H, Remmert M, Söding J, Thompson JD, Higgins DG (2011).
    Fast, scalable generation of high-quality protein multiple
    sequence alignments using Clustal Omega.
    Molecular Systems Biology 7:539 https://doi.org/10.1038/msb.2011.75

    Examples
    --------
    >>> from Bio.Align.Applications import ClustalOmegaCommandline
    >>> in_file = "unaligned.fasta"
    >>> out_file = "aligned.fasta"
    >>> clustalomega_cline = ClustalOmegaCommandline(infile=in_file, outfile=out_file, verbose=True, auto=True)
    >>> print(clustalomega_cline)
    clustalo -i unaligned.fasta -o aligned.fasta --auto -v

    You would typically run the command line with clustalomega_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    """

    def __init__(self, cmd="clustalo", **kwargs):
        """Declare the clustalo options, then forward ``cmd`` and ``kwargs``.

        The options are grouped by section and listed in the same order as
        ``clustalo --help``; the sections are concatenated into
        ``self.parameters`` before ``AbstractCommandline.__init__`` is called.
        """
        # Format names accepted by both --infmt and --outfmt.
        msa_formats = (
            "a2m",
            "fa",
            "fasta",
            "clu",
            "clustal",
            "msf",
            "phy",
            "phylip",
            "selex",
            "st",
            "stockholm",
            "vie",
            "vienna",
        )

        def is_int(value):
            return isinstance(value, int)

        def is_msa_format(value):
            return value in msa_formats

        def is_seq_type(value):
            return value in (
                "protein",
                "rna",
                "dna",
                "Protein",
                "RNA",
                "DNA",
                "PROTEIN",
            )

        def is_output_order(value):
            return value in ("input-order", "tree-order")

        sequence_input = [
            _Option(
                ["-i", "--in", "--infile", "infile"],
                "Multiple sequence input file",
                equate=False,
                filename=True,
            ),
            _Option(
                ["--hmm-in", "HMM input", "hmm_input"],
                "HMM input files",
                equate=False,
                filename=True,
            ),
            _Switch(["--dealign", "dealign"], "Dealign input sequences"),
            _Option(
                ["--profile1", "--p1", "profile1"],
                "Pre-aligned multiple sequence file (aligned columns will be kept fix).",
                equate=False,
                filename=True,
            ),
            _Option(
                ["--profile2", "--p2", "profile2"],
                "Pre-aligned multiple sequence file (aligned columns will be kept fix).",
                equate=False,
                filename=True,
            ),
            _Option(
                ["-t", "--seqtype", "seqtype"],
                "{Protein, RNA, DNA} Force a sequence type (default: auto).",
                checker_function=is_seq_type,
                equate=False,
            ),
            _Switch(
                ["--is-profile", "isprofile"],
                "disable check if profile, force profile (default no)",
            ),
            _Option(
                ["--infmt", "infmt"],
                "Forced sequence input file format (default: auto)\n"
                "Allowed values: a2m, fa[sta], clu[stal], msf, phy[lip], selex, st[ockholm], vie[nna]\n",
                checker_function=is_msa_format,
                equate=False,
            ),
        ]

        clustering = [
            _Option(
                ["--distmat-in", "distmat_in"],
                "Pairwise distance matrix input file (skips distance computation).",
                equate=False,
                filename=True,
            ),
            _Option(
                ["--distmat-out", "distmat_out"],
                "Pairwise distance matrix output file.",
                equate=False,
                filename=True,
            ),
            _Option(
                ["--guidetree-in", "guidetree_in"],
                "Guide tree input file (skips distance computation and guide-tree clustering step).",
                equate=False,
                filename=True,
            ),
            _Option(
                ["--guidetree-out", "guidetree_out"],
                "Guide tree output file.",
                equate=False,
                filename=True,
            ),
            _Switch(
                ["--full", "distmat_full"],
                "Use full distance matrix for guide-tree calculation (slow; mBed is default)",
            ),
            _Switch(
                ["--full-iter", "distmat_full_iter"],
                "Use full distance matrix for guide-tree calculation during iteration (mBed is default)",
            ),
            _Option(
                ["--cluster-size", "clustersize"],
                "soft maximum of sequences in sub-clusters",
                checker_function=is_int,
            ),
            _Option(
                ["--clustering-out", "clusteringout"],
                "Clustering output file",
                filename=True,
            ),
            _Switch(
                ["--use-kimura", "usekimura"],
                "use Kimura distance correction for aligned sequences (default no)",
            ),
            _Switch(
                ["--percent-id", "percentid"],
                "convert distances into percent identities (default no)",
            ),
        ]

        alignment_output = [
            _Option(
                ["-o", "--out", "--outfile", "outfile"],
                "Multiple sequence alignment output file (default: stdout).",
                equate=False,
                filename=True,
            ),
            _Option(
                ["--outfmt", "outfmt"],
                "MSA output file format:"
                " a2m=fa[sta],clu[stal],msf,phy[lip],selex,st[ockholm],vie[nna]"
                " (default: fasta).",
                checker_function=is_msa_format,
                equate=False,
            ),
            _Switch(
                ["--residuenumber", "--resno", "residuenumber"],
                "in Clustal format print residue numbers (default no)",
            ),
            _Option(
                ["--wrap", "wrap"],
                "number of residues before line-wrap in output",
                checker_function=is_int,
            ),
            _Option(
                ["--output-order", "outputorder"],
                "MSA output order like in input/guide-tree",
                checker_function=is_output_order,
            ),
        ]

        iteration = [
            _Option(
                ["--iterations", "--iter", "iterations"],
                "Number of (combined guide-tree/HMM) iterations",
                checker_function=is_int,
                equate=False,
            ),
            _Option(
                ["--max-guidetree-iterations", "max_guidetree_iterations"],
                "Maximum number of guidetree iterations",
                checker_function=is_int,
                equate=False,
            ),
            _Option(
                ["--max-hmm-iterations", "max_hmm_iterations"],
                "Maximum number of HMM iterations",
                checker_function=is_int,
                equate=False,
            ),
        ]

        # Limits (will exit early, if exceeded)
        limits = [
            _Option(
                ["--maxnumseq", "maxnumseq"],
                "Maximum allowed number of sequences",
                checker_function=is_int,
                equate=False,
            ),
            _Option(
                ["--maxseqlen", "maxseqlen"],
                "Maximum allowed sequence length",
                checker_function=is_int,
                equate=False,
            ),
        ]

        miscellaneous = [
            _Switch(
                ["--auto", "auto"],
                "Set options automatically (might overwrite some of your options)",
            ),
            _Option(
                ["--threads", "threads"],
                "Number of processors to use",
                checker_function=is_int,
                equate=False,
            ),
            _Option(
                ["-l", "--log", "log"],
                "Log all non-essential output to this file.",
                equate=False,
                filename=True,
            ),
            _Switch(["-h", "--help", "help"], "Print help and exit."),
            _Switch(["-v", "--verbose", "verbose"], "Verbose output"),
            _Switch(["--version", "version"], "Print version information and exit"),
            _Switch(
                ["--long-version", "long_version"],
                "Print long version information and exit",
            ),
            _Switch(["--force", "force"], "Force file overwriting."),
        ]

        self.parameters = (
            sequence_input
            + clustering
            + alignment_output
            + iteration
            + limits
            + miscellaneous
        )
        AbstractCommandline.__init__(self, cmd, **kwargs)
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in the docstrings.
    from Bio import _utils

    _utils.run_doctest()

View File

@ -1,488 +0,0 @@
# Copyright 2009 by Cymon J. Cox. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the multiple alignment program Clustal W."""
import os
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
class ClustalwCommandline(AbstractCommandline):
    """Command line wrapper for clustalw (version one or two).

    http://www.clustal.org/

    Notes
    -----
    Last checked against versions: 1.83 and 2.1

    References
    ----------
    Larkin MA, Blackshields G, Brown NP, Chenna R, McGettigan PA,
    McWilliam H, Valentin F, Wallace IM, Wilm A, Lopez R, Thompson JD,
    Gibson TJ, Higgins DG. (2007). Clustal W and Clustal X version 2.0.
    Bioinformatics, 23, 2947-2948.

    Examples
    --------
    >>> from Bio.Align.Applications import ClustalwCommandline
    >>> in_file = "unaligned.fasta"
    >>> clustalw_cline = ClustalwCommandline("clustalw2", infile=in_file)
    >>> print(clustalw_cline)
    clustalw2 -infile=unaligned.fasta

    You would typically run the command line with clustalw_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    """

    # TODO - Should we default to cmd="clustalw2" now?
    def __init__(self, cmd="clustalw", **kwargs):
        """Declare the clustalw options, then forward ``cmd`` and ``kwargs``.

        Every option is registered under four aliases (``-name``, ``-NAME``,
        ``NAME`` and ``name``).

        The guide-tree inputs ``usetree``, ``usetree1`` and ``usetree2`` are
        checked with ``os.path.exists``, so their value must name an existing
        path.  The weight-matrix options accept either one of the built-in
        matrix names or an existing path.
        """

        def is_int(value):
            return isinstance(value, int)

        def is_number(value):
            return isinstance(value, (float, int))

        def one_of(*choices):
            """Return a checker that accepts only the listed values."""
            return lambda value: value in choices

        def one_of_or_path(*choices):
            """Return a checker accepting a listed value or an existing path."""
            return lambda value: value in choices or os.path.exists(value)

        is_on_off = one_of("ON", "on", "OFF", "off")
        is_protein_matrix = one_of_or_path(
            "BLOSUM", "PAM", "GONNET", "ID", "blosum", "pam", "gonnet", "id"
        )
        is_dna_matrix = one_of_or_path("IUB", "CLUSTALW", "iub", "clustalw")

        self.parameters = [
            _Option(
                ["-infile", "-INFILE", "INFILE", "infile"],
                "Input sequences.",
                filename=True,
            ),
            _Option(
                ["-profile1", "-PROFILE1", "PROFILE1", "profile1"],
                "Profiles (old alignment).",
                filename=True,
            ),
            _Option(
                ["-profile2", "-PROFILE2", "PROFILE2", "profile2"],
                "Profiles (old alignment).",
                filename=True,
            ),
            # ################# VERBS (do things) #############################
            _Switch(
                ["-options", "-OPTIONS", "OPTIONS", "options"],
                "List the command line parameters",
            ),
            _Switch(
                ["-help", "-HELP", "HELP", "help"], "Outline the command line params."
            ),
            _Switch(
                ["-check", "-CHECK", "CHECK", "check"],
                "Outline the command line params.",
            ),
            _Switch(
                ["-fullhelp", "-FULLHELP", "FULLHELP", "fullhelp"],
                "Output full help content.",
            ),
            _Switch(
                ["-align", "-ALIGN", "ALIGN", "align"], "Do full multiple alignment."
            ),
            _Switch(["-tree", "-TREE", "TREE", "tree"], "Calculate NJ tree."),
            _Switch(
                ["-pim", "-PIM", "PIM", "pim"],
                "Output percent identity matrix (while calculating the tree).",
            ),
            _Option(
                ["-bootstrap", "-BOOTSTRAP", "BOOTSTRAP", "bootstrap"],
                "Bootstrap a NJ tree (n= number of bootstraps; def. = 1000).",
                checker_function=is_int,
            ),
            _Switch(
                ["-convert", "-CONVERT", "CONVERT", "convert"],
                "Output the input sequences in a different file format.",
            ),
            # #################### PARAMETERS (set things) #########################
            # ***General settings:****
            # "-interactive" (read command line, then enter normal interactive
            # menus) is deliberately not wrapped: it makes no sense in Biopython.
            _Switch(
                ["-quicktree", "-QUICKTREE", "QUICKTREE", "quicktree"],
                "Use FAST algorithm for the alignment guide tree",
            ),
            _Option(
                ["-type", "-TYPE", "TYPE", "type"],
                "PROTEIN or DNA sequences",
                checker_function=one_of("PROTEIN", "DNA", "protein", "dna"),
            ),
            _Switch(
                ["-negative", "-NEGATIVE", "NEGATIVE", "negative"],
                "Protein alignment with negative values in matrix",
            ),
            _Option(
                ["-outfile", "-OUTFILE", "OUTFILE", "outfile"],
                "Output sequence alignment file name",
                filename=True,
            ),
            _Option(
                ["-output", "-OUTPUT", "OUTPUT", "output"],
                "Output format: CLUSTAL(default), GCG, GDE, PHYLIP, PIR, NEXUS and FASTA",
                checker_function=one_of(
                    "CLUSTAL",
                    "GCG",
                    "GDE",
                    "PHYLIP",
                    "PIR",
                    "NEXUS",
                    "FASTA",
                    "clustal",
                    "gcg",
                    "gde",
                    "phylip",
                    "pir",
                    "nexus",
                    "fasta",
                ),
            ),
            _Option(
                ["-outorder", "-OUTORDER", "OUTORDER", "outorder"],
                "Output taxon order: INPUT or ALIGNED",
                checker_function=one_of("INPUT", "input", "ALIGNED", "aligned"),
            ),
            _Option(
                ["-case", "-CASE", "CASE", "case"],
                "LOWER or UPPER (for GDE output only)",
                checker_function=one_of("UPPER", "upper", "LOWER", "lower"),
            ),
            _Option(
                ["-seqnos", "-SEQNOS", "SEQNOS", "seqnos"],
                "OFF or ON (for Clustal output only)",
                checker_function=is_on_off,
            ),
            _Option(
                ["-seqno_range", "-SEQNO_RANGE", "SEQNO_RANGE", "seqno_range"],
                "OFF or ON (NEW- for all output formats)",
                checker_function=is_on_off,
            ),
            _Option(
                ["-range", "-RANGE", "RANGE", "range"],
                "Sequence range to write starting m to m+n. "
                "Input as string eg. '24,200'",
            ),
            _Option(
                ["-maxseqlen", "-MAXSEQLEN", "MAXSEQLEN", "maxseqlen"],
                "Maximum allowed input sequence length",
                checker_function=is_int,
            ),
            _Switch(
                ["-quiet", "-QUIET", "QUIET", "quiet"],
                "Reduce console output to minimum",
            ),
            _Option(
                ["-stats", "-STATS", "STATS", "stats"],
                "Log some alignment statistics to file",
                filename=True,
            ),
            # ***Fast Pairwise Alignments:***
            _Option(
                ["-ktuple", "-KTUPLE", "KTUPLE", "ktuple"],
                "Word size",
                checker_function=is_number,
            ),
            _Option(
                ["-topdiags", "-TOPDIAGS", "TOPDIAGS", "topdiags"],
                "Number of best diags.",
                checker_function=is_number,
            ),
            _Option(
                ["-window", "-WINDOW", "WINDOW", "window"],
                "Window around best diags.",
                checker_function=is_number,
            ),
            _Option(
                ["-pairgap", "-PAIRGAP", "PAIRGAP", "pairgap"],
                "Gap penalty",
                checker_function=is_number,
            ),
            _Option(
                ["-score", "-SCORE", "SCORE", "score"],
                "Either: PERCENT or ABSOLUTE",
                checker_function=one_of("percent", "PERCENT", "absolute", "ABSOLUTE"),
            ),
            # ***Slow Pairwise Alignments:***
            _Option(
                ["-pwmatrix", "-PWMATRIX", "PWMATRIX", "pwmatrix"],
                "Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename",
                checker_function=is_protein_matrix,
                filename=True,
            ),
            _Option(
                ["-pwdnamatrix", "-PWDNAMATRIX", "PWDNAMATRIX", "pwdnamatrix"],
                "DNA weight matrix=IUB, CLUSTALW or filename",
                checker_function=is_dna_matrix,
                filename=True,
            ),
            _Option(
                ["-pwgapopen", "-PWGAPOPEN", "PWGAPOPEN", "pwgapopen"],
                "Gap opening penalty",
                checker_function=is_number,
            ),
            _Option(
                ["-pwgapext", "-PWGAPEXT", "PWGAPEXT", "pwgapext"],
                "Gap extension penalty",
                checker_function=is_number,
            ),
            # ***Multiple Alignments:***
            _Option(
                ["-newtree", "-NEWTREE", "NEWTREE", "newtree"],
                "Output file name for newly created guide tree",
                filename=True,
            ),
            _Option(
                ["-usetree", "-USETREE", "USETREE", "usetree"],
                "File name of guide tree",
                # The checker used to return the os.path.exists function object
                # itself (always truthy); it must be called on the value.
                checker_function=os.path.exists,
                filename=True,
            ),
            _Option(
                ["-matrix", "-MATRIX", "MATRIX", "matrix"],
                "Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename",
                checker_function=is_protein_matrix,
                filename=True,
            ),
            _Option(
                ["-dnamatrix", "-DNAMATRIX", "DNAMATRIX", "dnamatrix"],
                "DNA weight matrix=IUB, CLUSTALW or filename",
                checker_function=is_dna_matrix,
                filename=True,
            ),
            _Option(
                ["-gapopen", "-GAPOPEN", "GAPOPEN", "gapopen"],
                "Gap opening penalty",
                checker_function=is_number,
            ),
            _Option(
                ["-gapext", "-GAPEXT", "GAPEXT", "gapext"],
                "Gap extension penalty",
                checker_function=is_number,
            ),
            _Switch(
                ["-endgaps", "-ENDGAPS", "ENDGAPS", "endgaps"],
                "No end gap separation pen.",
            ),
            _Option(
                ["-gapdist", "-GAPDIST", "GAPDIST", "gapdist"],
                "Gap separation pen. range",
                checker_function=is_number,
            ),
            _Switch(
                ["-nopgap", "-NOPGAP", "NOPGAP", "nopgap"], "Residue-specific gaps off"
            ),
            _Switch(["-nohgap", "-NOHGAP", "NOHGAP", "nohgap"], "Hydrophilic gaps off"),
            _Switch(
                ["-hgapresidues", "-HGAPRESIDUES", "HGAPRESIDUES", "hgapresidues"],
                "List hydrophilic res.",
            ),
            _Option(
                ["-maxdiv", "-MAXDIV", "MAXDIV", "maxdiv"],
                "% ident. for delay",
                checker_function=is_number,
            ),
            # "-type" appears a second time under Multiple Alignments in the
            # help; it is already handled in the General Settings section.
            _Option(
                ["-transweight", "-TRANSWEIGHT", "TRANSWEIGHT", "transweight"],
                "Transitions weighting",
                checker_function=is_number,
            ),
            _Option(
                ["-iteration", "-ITERATION", "ITERATION", "iteration"],
                "NONE or TREE or ALIGNMENT",
                checker_function=one_of(
                    "NONE", "TREE", "ALIGNMENT", "none", "tree", "alignment"
                ),
            ),
            _Option(
                ["-numiter", "-NUMITER", "NUMITER", "numiter"],
                "maximum number of iterations to perform",
                checker_function=is_int,
            ),
            _Switch(
                ["-noweights", "-NOWEIGHTS", "NOWEIGHTS", "noweights"],
                "Disable sequence weighting",
            ),
            # ***Profile Alignments:***
            _Switch(
                ["-profile", "-PROFILE", "PROFILE", "profile"],
                "Merge two alignments by profile alignment",
            ),
            _Option(
                ["-newtree1", "-NEWTREE1", "NEWTREE1", "newtree1"],
                "Output file name for new guide tree of profile1",
                filename=True,
            ),
            _Option(
                ["-newtree2", "-NEWTREE2", "NEWTREE2", "newtree2"],
                "Output file for new guide tree of profile2",
                filename=True,
            ),
            _Option(
                ["-usetree1", "-USETREE1", "USETREE1", "usetree1"],
                "File name of guide tree for profile1",
                checker_function=os.path.exists,
                filename=True,
            ),
            _Option(
                ["-usetree2", "-USETREE2", "USETREE2", "usetree2"],
                "File name of guide tree for profile2",
                checker_function=os.path.exists,
                filename=True,
            ),
            # ***Sequence to Profile Alignments:***
            _Switch(
                ["-sequences", "-SEQUENCES", "SEQUENCES", "sequences"],
                "Sequentially add profile2 sequences to profile1 alignment",
            ),
            # "-newtree" and "-usetree" appear a second time here in the help;
            # they are already handled in the Multiple Alignments section.
            # ***Structure Alignments:***
            _Switch(
                ["-nosecstr1", "-NOSECSTR1", "NOSECSTR1", "nosecstr1"],
                "Do not use secondary structure-gap penalty mask for profile 1",
            ),
            _Switch(
                ["-nosecstr2", "-NOSECSTR2", "NOSECSTR2", "nosecstr2"],
                "Do not use secondary structure-gap penalty mask for profile 2",
            ),
            _Option(
                ["-secstrout", "-SECSTROUT", "SECSTROUT", "secstrout"],
                "STRUCTURE or MASK or BOTH or NONE output in alignment file",
                checker_function=one_of(
                    "STRUCTURE",
                    "MASK",
                    "BOTH",
                    "NONE",
                    "structure",
                    "mask",
                    "both",
                    "none",
                ),
            ),
            _Option(
                ["-helixgap", "-HELIXGAP", "HELIXGAP", "helixgap"],
                "Gap penalty for helix core residues",
                checker_function=is_number,
            ),
            _Option(
                ["-strandgap", "-STRANDGAP", "STRANDGAP", "strandgap"],
                "gap penalty for strand core residues",
                checker_function=is_number,
            ),
            _Option(
                ["-loopgap", "-LOOPGAP", "LOOPGAP", "loopgap"],
                "Gap penalty for loop regions",
                checker_function=is_number,
            ),
            _Option(
                ["-terminalgap", "-TERMINALGAP", "TERMINALGAP", "terminalgap"],
                "Gap penalty for structure termini",
                checker_function=is_number,
            ),
            _Option(
                ["-helixendin", "-HELIXENDIN", "HELIXENDIN", "helixendin"],
                "Number of residues inside helix to be treated as terminal",
                checker_function=is_int,
            ),
            _Option(
                ["-helixendout", "-HELIXENDOUT", "HELIXENDOUT", "helixendout"],
                "Number of residues outside helix to be treated as terminal",
                checker_function=is_int,
            ),
            _Option(
                ["-strandendin", "-STRANDENDIN", "STRANDENDIN", "strandendin"],
                "Number of residues inside strand to be treated as terminal",
                checker_function=is_int,
            ),
            _Option(
                ["-strandendout", "-STRANDENDOUT", "STRANDENDOUT", "strandendout"],
                "Number of residues outside strand to be treated as terminal",
                checker_function=is_int,
            ),
            # ***Trees:***
            _Option(
                ["-outputtree", "-OUTPUTTREE", "OUTPUTTREE", "outputtree"],
                "nj OR phylip OR dist OR nexus",
                checker_function=one_of(
                    "NJ", "PHYLIP", "DIST", "NEXUS", "nj", "phylip", "dist", "nexus"
                ),
            ),
            _Option(
                ["-seed", "-SEED", "SEED", "seed"],
                "Seed number for bootstraps.",
                checker_function=is_int,
            ),
            _Switch(
                ["-kimura", "-KIMURA", "KIMURA", "kimura"], "Use Kimura's correction."
            ),
            _Switch(
                ["-tossgaps", "-TOSSGAPS", "TOSSGAPS", "tossgaps"],
                "Ignore positions with gaps.",
            ),
            _Option(
                ["-bootlabels", "-BOOTLABELS", "BOOTLABELS", "bootlabels"],
                "Node OR branch position of bootstrap values in tree display",
                checker_function=one_of("NODE", "BRANCH", "node", "branch"),
            ),
            _Option(
                ["-clustering", "-CLUSTERING", "CLUSTERING", "clustering"],
                "NJ or UPGMA",
                checker_function=one_of("NJ", "UPGMA", "nj", "upgma"),
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in the docstrings.
    from Bio import _utils

    _utils.run_doctest()

View File

@ -1,246 +0,0 @@
# Copyright 2009 by Cymon J. Cox. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the multiple alignment program DIALIGN2-2."""
from Bio.Application import _Argument
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
class DialignCommandline(AbstractCommandline):
    """Build command lines for the DIALIGN2-2 multiple alignment program.

    http://bibiserv.techfak.uni-bielefeld.de/dialign/welcome.html

    Notes
    -----
    Last checked against version: 2.2

    References
    ----------
    B. Morgenstern (2004). DIALIGN: Multiple DNA and Protein Sequence
    Alignment at BiBiServ. Nucleic Acids Research 32, W33-W36.

    Examples
    --------
    To align a FASTA file (unaligned.fasta) with the output files names
    aligned.* including a FASTA output file (aligned.fa), use:

    >>> from Bio.Align.Applications import DialignCommandline
    >>> dialign_cline = DialignCommandline(input="unaligned.fasta",
    ...                                    fn="aligned", fa=True)
    >>> print(dialign_cline)
    dialign2-2 -fa -fn aligned unaligned.fasta

    You would typically run the command line with dialign_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    """

    def __init__(self, cmd="dialign2-2", **kwargs):
        """Declare the dialign2-2 options, then forward ``cmd`` and ``kwargs``.

        ``cmd`` is also stored on ``self.program_name`` before the option
        list is built.
        """

        def is_int(value):
            return isinstance(value, int)

        def is_star_count(value):
            return value in range(10)

        self.program_name = cmd
        options = [
            _Switch(
                ["-afc", "afc"],
                r"Creates additional output file '\*.afc' "
                "containing data of all fragments considered "
                "for alignment WARNING: this file can be HUGE !",
            ),
            _Switch(
                ["-afc_v", "afc_v"],
                "Like '-afc' but verbose: fragments are explicitly "
                "printed. WARNING: this file can be EVEN BIGGER !",
            ),
            _Switch(
                ["-anc", "anc"],
                "Anchored alignment. Requires a file <seq_file>.anc "
                "containing anchor points.",
            ),
            _Switch(
                ["-cs", "cs"],
                "If segments are translated, not only the 'Watson "
                "strand' but also the 'Crick strand' is looked at.",
            ),
            _Switch(["-cw", "cw"], "Additional output file in CLUSTAL W format."),
            _Switch(
                ["-ds", "ds"],
                "'dna alignment speed up' - non-translated nucleic acid "
                "fragments are taken into account only if they start "
                "with at least two matches. Speeds up DNA alignment at "
                "the expense of sensitivity.",
            ),
            _Switch(["-fa", "fa"], "Additional output file in FASTA format."),
            _Switch(
                ["-ff", "ff"],
                r"Creates file \*.frg containing information about all "
                "fragments that are part of the respective optimal "
                "pairwise alignmnets plus information about "
                "consistency in the multiple alignment",
            ),
            _Option(
                ["-fn", "fn"],
                "Output files are named <out_file>.<extension>.",
                equate=False,
            ),
            _Switch(
                ["-fop", "fop"],
                r"Creates file \*.fop containing coordinates of all "
                "fragments that are part of the respective pairwise alignments.",
            ),
            _Switch(
                ["-fsm", "fsm"],
                r"Creates file \*.fsm containing coordinates of all "
                "fragments that are part of the final alignment",
            ),
            _Switch(
                ["-iw", "iw"],
                "Overlap weights switched off (by default, overlap "
                "weights are used if up to 35 sequences are aligned). "
                "This option speeds up the alignment but may lead "
                "to reduced alignment quality.",
            ),
            _Switch(
                ["-lgs", "lgs"],
                "'long genomic sequences' - combines the following "
                "options: -ma, -thr 2, -lmax 30, -smin 8, -nta, -ff, "
                "-fop, -ff, -cs, -ds, -pst ",
            ),
            _Switch(
                ["-lgs_t", "lgs_t"],
                "Like '-lgs' but with all segment pairs assessed "
                "at the peptide level (rather than 'mixed alignments' "
                "as with the '-lgs' option). Therefore faster than "
                "-lgs but not very sensitive for non-coding regions.",
            ),
            _Option(
                ["-lmax", "lmax"],
                "Maximum fragment length = x (default: x = 40 or "
                "x = 120 for 'translated' fragments). Shorter x "
                "speeds up the program but may affect alignment quality.",
                equate=False,
                checker_function=is_int,
            ),
            _Switch(
                ["-lo", "lo"],
                r"(Long Output) Additional file \*.log with information "
                "about fragments selected for pairwise alignment and "
                "about consistency in multi-alignment procedure.",
            ),
            _Switch(
                ["-ma", "ma"],
                "'mixed alignments' consisting of P-fragments and "
                "N-fragments if nucleic acid sequences are aligned.",
            ),
            _Switch(
                ["-mask", "mask"],
                "Residues not belonging to selected fragments are "
                r"replaced by '\*' characters in output alignment "
                "(rather than being printed in lower-case characters)",
            ),
            _Switch(
                ["-mat", "mat"],
                r"Creates file \*mat with substitution counts derived "
                "from the fragments that have been selected for alignment.",
            ),
            _Switch(
                ["-mat_thr", "mat_thr"],
                "Like '-mat' but only fragments with weight score "
                "> t are considered",
            ),
            _Switch(
                ["-max_link", "max_link"],
                "'maximum linkage' clustering used to construct "
                "sequence tree (instead of UPGMA).",
            ),
            _Switch(["-min_link", "min_link"], "'minimum linkage' clustering used."),
            _Option(["-mot", "mot"], "'motif' option.", equate=False),
            _Switch(["-msf", "msf"], "Separate output file in MSF format."),
            _Switch(
                ["-n", "n"],
                "Input sequences are nucleic acid sequences. "
                "No translation of fragments.",
            ),
            _Switch(
                ["-nt", "nt"],
                "Input sequences are nucleic acid sequences and "
                "'nucleic acid segments' are translated to 'peptide "
                "segments'.",
            ),
            _Switch(
                ["-nta", "nta"],
                "'no textual alignment' - textual alignment suppressed. "
                "This option makes sense if other output files are of "
                "interest -- e.g. the fragment files created with -ff, "
                "-fop, -fsm or -lo.",
            ),
            _Switch(
                ["-o", "o"],
                "Fast version, resulting alignments may be slightly different.",
            ),
            _Switch(
                ["-ow", "ow"],
                "Overlap weights enforced (By default, overlap weights "
                "are used only if up to 35 sequences are aligned since "
                "calculating overlap weights is time consuming).",
            ),
            _Switch(
                ["-pst", "pst"],
                r"'print status'. Creates and updates a file \*.sta with "
                "information about the current status of the program "
                "run. This option is recommended if large data sets "
                "are aligned since it allows the user to estimate the "
                "remaining running time.",
            ),
            _Switch(
                ["-smin", "smin"],
                "Minimum similarity value for first residue pair "
                "(or codon pair) in fragments. Speeds up protein "
                "alignment or alignment of translated DNA fragments "
                "at the expense of sensitivity.",
            ),
            _Option(
                ["-stars", "stars"],
                r"Maximum number of '\*' characters indicating degree "
                "of local similarity among sequences. By default, no "
                "stars are used but numbers between 0 and 9, instead.",
                equate=False,
                checker_function=is_star_count,
            ),
            _Switch(["-stdo", "stdo"], "Results written to standard output."),
            _Switch(
                ["-ta", "ta"],
                "Standard textual alignment printed (overrides "
                "suppression of textual alignments in special "
                "options, e.g. -lgs)",
            ),
            _Option(
                ["-thr", "thr"],
                "Threshold T = x.",
                equate=False,
                checker_function=is_int,
            ),
            _Switch(
                ["-xfr", "xfr"],
                "'exclude fragments' - list of fragments can be "
                "specified that are NOT considered for pairwise alignment",
            ),
            # The input file is the only positional argument and is declared
            # last in the list.
            _Argument(
                ["input"],
                "Input file name. Must be FASTA format",
                is_required=True,
                filename=True,
            ),
        ]
        self.parameters = options
        AbstractCommandline.__init__(self, cmd, **kwargs)
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in the docstrings.
    from Bio import _utils

    _utils.run_doctest()

View File

@ -1,92 +0,0 @@
# Copyright 2013 by Christian Brueffer. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the multiple sequence alignment program MSAProbs."""
from Bio.Application import _Argument
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
class MSAProbsCommandline(AbstractCommandline):
    """Command line wrapper for MSAProbs.

    http://msaprobs.sourceforge.net

    Notes
    -----
    Last checked against version: 0.9.7

    References
    ----------
    Yongchao Liu, Bertil Schmidt, Douglas L. Maskell: "MSAProbs: multiple
    sequence alignment based on pair hidden Markov models and partition
    function posterior probabilities". Bioinformatics, 2010, 26(16): 1958 -1964

    Examples
    --------
    >>> from Bio.Align.Applications import MSAProbsCommandline
    >>> in_file = "unaligned.fasta"
    >>> out_file = "aligned.cla"
    >>> cline = MSAProbsCommandline(infile=in_file, outfile=out_file, clustalw=True)
    >>> print(cline)
    msaprobs -o aligned.cla -clustalw unaligned.fasta

    You would typically run the command line with cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    """

    def __init__(self, cmd="msaprobs", **kwargs):
        """Declare the msaprobs options, then forward ``cmd`` and ``kwargs``.

        The options are listed in the same order as ``msaprobs -help``.
        ``version`` is a plain on/off flag: it is declared as a switch
        because "print out version" takes no value.
        """

        def is_int(value):
            return isinstance(value, int)

        self.parameters = [
            _Option(
                ["-o", "--outfile", "outfile"],
                "specify the output file name (STDOUT by default)",
                filename=True,
                equate=False,
            ),
            # NOTE(review): the options below without equate=False use the
            # default value/flag joining; confirm msaprobs accepts that form.
            _Option(
                ["-num_threads", "numthreads"],
                "specify the number of threads used, and otherwise detect automatically",
                checker_function=is_int,
            ),
            _Switch(
                ["-clustalw", "clustalw"],
                "use CLUSTALW output format instead of FASTA format",
            ),
            _Option(
                ["-c", "consistency"],
                "use 0 <= REPS <= 5 (default: 2) passes of consistency transformation",
                checker_function=lambda x: isinstance(x, int) and 0 <= x <= 5,
            ),
            _Option(
                ["-ir", "--iterative-refinement", "iterative_refinement"],
                "use 0 <= REPS <= 1000 (default: 10) passes of iterative-refinement",
                checker_function=lambda x: isinstance(x, int) and 0 <= x <= 1000,
            ),
            _Switch(["-v", "verbose"], "report progress while aligning (default: off)"),
            _Option(
                ["-annot", "annot"],
                "write annotation for multiple alignment to FILENAME",
                filename=True,
            ),
            _Switch(
                ["-a", "--alignment-order", "alignment_order"],
                "print sequences in alignment order rather than input order (default: off)",
            ),
            # Was an _Option, which would have attached a value to a flag
            # that only prints the version.
            _Switch(["-version", "version"], "print out version of MSAPROBS"),
            _Argument(["infile"], "Multiple sequence input file", filename=True),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in the docstrings.
    from Bio import _utils

    _utils.run_doctest()

View File

@ -1,437 +0,0 @@
# Copyright 2009 by Cymon J. Cox. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the multiple alignment programme MAFFT."""
from Bio.Application import _Argument
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
class MafftCommandline(AbstractCommandline):
    """Command line wrapper for the multiple alignment program MAFFT.

    http://align.bmr.kyushu-u.ac.jp/mafft/software/

    Notes
    -----
    Last checked against version: MAFFT v6.717b (2009/12/03)

    References
    ----------
    Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
    multiple ncRNA alignment by incorporating structural information into
    a MAFFT-based framework (describes RNA structural alignment methods)

    Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
    developments in the MAFFT multiple sequence alignment program
    (outlines version 6)

    Katoh, Toh (Bioinformatics 23:372-374, 2007)  Errata PartTree: an
    algorithm to build an approximate tree from a large number of
    unaligned sequences (describes the PartTree algorithm)

    Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
    version 5: improvement in accuracy of multiple sequence alignment
    (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
    strategies)

    Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)
    MAFFT: a novel method for rapid multiple sequence alignment based on
    fast Fourier transform

    Examples
    --------
    >>> from Bio.Align.Applications import MafftCommandline
    >>> mafft_exe = "/opt/local/mafft"
    >>> in_file = "../Doc/examples/opuntia.fasta"
    >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file)
    >>> print(mafft_cline)
    /opt/local/mafft ../Doc/examples/opuntia.fasta

    If the mafft binary is on the path (typically the case on a Unix style
    operating system) then you don't need to supply the executable location:

    >>> from Bio.Align.Applications import MafftCommandline
    >>> in_file = "../Doc/examples/opuntia.fasta"
    >>> mafft_cline = MafftCommandline(input=in_file)
    >>> print(mafft_cline)
    mafft ../Doc/examples/opuntia.fasta

    You would typically run the command line with mafft_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    Note that MAFFT will write the alignment to stdout, which you may
    want to save to a file and then parse, e.g.::

        stdout, stderr = mafft_cline()
        with open("aligned.fasta", "w") as handle:
            handle.write(stdout)
        from Bio import AlignIO
        align = AlignIO.read("aligned.fasta", "fasta")

    Alternatively, to parse the output with AlignIO directly you can
    use StringIO to turn the string into a handle::

        stdout, stderr = mafft_cline()
        from io import StringIO
        from Bio import AlignIO
        align = AlignIO.read(StringIO(stdout), "fasta")

    """

    def __init__(self, cmd="mafft", **kwargs):
        """Initialize the class.

        Builds ``self.parameters`` (the MAFFT options this wrapper knows
        about, in the order they are listed here) and then delegates to
        ``AbstractCommandline.__init__`` with ``cmd`` and the caller's
        keyword arguments.
        """
        # Accepted values for --bl; note these are strings, not integers.
        BLOSUM_MATRICES = ["30", "45", "62", "80"]
        self.parameters = [
            # **** Algorithm ****
            # Automatically selects an appropriate strategy from L-INS-i, FFT-NS-
            # i and FFT-NS-2, according to data size. Default: off (always FFT-NS-2)
            _Switch(["--auto", "auto"], "Automatically select strategy. Default off."),
            # Distance is calculated based on the number of shared 6mers. Default: on
            # ("6merpair" is not a valid Python identifier, hence the extra
            # "sixmerpair" alias.)
            _Switch(
                ["--6merpair", "6merpair", "sixmerpair"],
                "Distance is calculated based on the number of shared "
                "6mers. Default: on",
            ),
            # All pairwise alignments are computed with the Needleman-Wunsch
            # algorithm. More accurate but slower than --6merpair. Suitable for a
            # set of globally alignable sequences. Applicable to up to ~200
            # sequences. A combination with --maxiterate 1000 is recommended (G-
            # INS-i). Default: off (6mer distance is used)
            _Switch(
                ["--globalpair", "globalpair"],
                "All pairwise alignments are computed with the "
                "Needleman-Wunsch algorithm. Default: off",
            ),
            # All pairwise alignments are computed with the Smith-Waterman
            # algorithm. More accurate but slower than --6merpair. Suitable for a
            # set of locally alignable sequences. Applicable to up to ~200
            # sequences. A combination with --maxiterate 1000 is recommended (L-
            # INS-i). Default: off (6mer distance is used)
            _Switch(
                ["--localpair", "localpair"],
                "All pairwise alignments are computed with the "
                "Smith-Waterman algorithm. Default: off",
            ),
            # All pairwise alignments are computed with a local algorithm with
            # the generalized affine gap cost (Altschul 1998). More accurate but
            # slower than --6merpair. Suitable when large internal gaps are
            # expected. Applicable to up to ~200 sequences. A combination with
            # --maxiterate 1000 is recommended (E-INS-i). Default: off (6mer
            # distance is used)
            _Switch(
                ["--genafpair", "genafpair"],
                "All pairwise alignments are computed with a local "
                "algorithm with the generalized affine gap cost "
                "(Altschul 1998). Default: off",
            ),
            # All pairwise alignments are computed with FASTA (Pearson and Lipman
            # 1988). FASTA is required. Default: off (6mer distance is used)
            _Switch(
                ["--fastapair", "fastapair"],
                "All pairwise alignments are computed with FASTA "
                "(Pearson and Lipman 1988). Default: off",
            ),
            # Weighting factor for the consistency term calculated from pairwise
            # alignments. Valid when either of --globalpair, --localpair,
            # --genafpair, --fastapair or --blastpair is selected. Default: 2.7
            _Option(
                ["--weighti", "weighti"],
                "Weighting factor for the consistency term calculated "
                "from pairwise alignments. Default: 2.7",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Guide tree is built number times in the progressive stage. Valid
            # with 6mer distance. Default: 2
            _Option(
                ["--retree", "retree"],
                "Guide tree is built number times in the progressive "
                "stage. Valid with 6mer distance. Default: 2",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Number cycles of iterative refinement are performed. Default: 0
            _Option(
                ["--maxiterate", "maxiterate"],
                "Number cycles of iterative refinement are performed. Default: 0",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Number of threads to use. Default: 1
            _Option(
                ["--thread", "thread"],
                "Number of threads to use. Default: 1",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Use FFT approximation in group-to-group alignment. Default: on
            _Switch(
                ["--fft", "fft"],
                "Use FFT approximation in group-to-group alignment. Default: on",
            ),
            # Do not use FFT approximation in group-to-group alignment. Default:
            # off
            _Switch(
                ["--nofft", "nofft"],
                "Do not use FFT approximation in group-to-group "
                "alignment. Default: off",
            ),
            # Alignment score is not checked in the iterative refinement stage.
            # Default: off (score is checked)
            _Switch(
                ["--noscore", "noscore"],
                "Alignment score is not checked in the iterative "
                "refinement stage. Default: off (score is checked)",
            ),
            # Use the Myers-Miller (1988) algorithm. Default: automatically
            # turned on when the alignment length exceeds 10,000 (aa/nt).
            _Switch(
                ["--memsave", "memsave"],
                "Use the Myers-Miller (1988) algorithm. Default: "
                "automatically turned on when the alignment length "
                "exceeds 10,000 (aa/nt).",
            ),
            # Use a fast tree-building method (PartTree, Katoh and Toh 2007) with
            # the 6mer distance. Recommended when a large number (> ~10,000) of
            # sequences are input. Default: off
            _Switch(
                ["--parttree", "parttree"],
                "Use a fast tree-building method with the 6mer "
                "distance. Default: off",
            ),
            # The PartTree algorithm is used with distances based on DP. Slightly
            # more accurate and slower than --parttree. Recommended when a large
            # number (> ~10,000) of sequences are input. Default: off
            _Switch(
                ["--dpparttree", "dpparttree"],
                "The PartTree algorithm is used with distances "
                "based on DP. Default: off",
            ),
            # The PartTree algorithm is used with distances based on FASTA.
            # Slightly more accurate and slower than --parttree. Recommended when
            # a large number (> ~10,000) of sequences are input. FASTA is
            # required. Default: off
            _Switch(
                ["--fastaparttree", "fastaparttree"],
                "The PartTree algorithm is used with distances based "
                "on FASTA. Default: off",
            ),
            # The number of partitions in the PartTree algorithm. Default: 50
            _Option(
                ["--partsize", "partsize"],
                "The number of partitions in the PartTree algorithm. Default: 50",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Do not make alignment larger than number sequences. Valid only with
            # the --*parttree options. Default: the number of input sequences
            # This takes a number, so it must be a valued option: as a bare
            # switch the wrapper emitted "--groupsize" with no value after it.
            _Option(
                ["--groupsize", "groupsize"],
                "Do not make alignment larger than number sequences. "
                "Default: the number of input sequences",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Adjust direction according to the first sequence
            # Mafft V6 beta function
            _Switch(
                ["--adjustdirection", "adjustdirection"],
                "Adjust direction according to the first sequence. Default off.",
            ),
            # Adjust direction according to the first sequence
            # for highly diverged data; very slow
            # Mafft V6 beta function
            _Switch(
                ["--adjustdirectionaccurately", "adjustdirectionaccurately"],
                "Adjust direction according to the first sequence, "
                "for highly diverged data; very slow. "
                "Default off.",
            ),
            # **** Parameter ****
            # Gap opening penalty at group-to-group alignment. Default: 1.53
            _Option(
                ["--op", "op"],
                "Gap opening penalty at group-to-group alignment. Default: 1.53",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Offset value, which works like gap extension penalty, for
            # group-to-group alignment. Default: 0.123
            _Option(
                ["--ep", "ep"],
                "Offset value, which works like gap extension penalty, "
                "for group-to-group alignment. Default: 0.123",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Gap opening penalty at local pairwise alignment. Valid when the
            # --localpair or --genafpair option is selected. Default: -2.00
            _Option(
                ["--lop", "lop"],
                "Gap opening penalty at local pairwise alignment. Default: -2.00",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Offset value at local pairwise alignment. Valid when the
            # --localpair or --genafpair option is selected. Default: 0.1
            _Option(
                ["--lep", "lep"],
                "Offset value at local pairwise alignment. Default: 0.1",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Gap extension penalty at local pairwise alignment. Valid when the
            # --localpair or --genafpair option is selected. Default: -0.1
            _Option(
                ["--lexp", "lexp"],
                "Gap extension penalty at local pairwise alignment. Default: -0.1",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Gap opening penalty to skip the alignment. Valid when the
            # --genafpair option is selected. Default: -6.00
            _Option(
                ["--LOP", "LOP"],
                "Gap opening penalty to skip the alignment. Default: -6.00",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Gap extension penalty to skip the alignment. Valid when the
            # --genafpair option is selected. Default: 0.00
            _Option(
                ["--LEXP", "LEXP"],
                "Gap extension penalty to skip the alignment. Default: 0.00",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # BLOSUM number matrix (Henikoff and Henikoff 1992) is used.
            # number=30, 45, 62 or 80. Default: 62
            _Option(
                ["--bl", "bl"],
                "BLOSUM number matrix is used. Default: 62",
                checker_function=lambda x: x in BLOSUM_MATRICES,
                equate=False,
            ),
            # JTT PAM number (Jones et al. 1992) matrix is used. number>0.
            # Default: BLOSUM62
            _Option(
                ["--jtt", "jtt"],
                "JTT PAM number (Jones et al. 1992) matrix is used. "
                "number>0. Default: BLOSUM62",
                equate=False,
            ),
            # Transmembrane PAM number (Jones et al. 1994) matrix is used.
            # number>0. Default: BLOSUM62
            _Option(
                ["--tm", "tm"],
                "Transmembrane PAM number (Jones et al. 1994) "
                "matrix is used. number>0. Default: BLOSUM62",
                filename=True,  # to ensure spaced inputs are quoted
                equate=False,
            ),
            # Use a user-defined AA scoring matrix. The format of matrixfile is
            # the same to that of BLAST. Ignored when nucleotide sequences are
            # input. Default: BLOSUM62
            _Option(
                ["--aamatrix", "aamatrix"],
                "Use a user-defined AA scoring matrix. Default: BLOSUM62",
                filename=True,  # to ensure spaced inputs are quoted
                equate=False,
            ),
            # Incorporate the AA/nuc composition information into the scoring
            # matrix. Default: off
            _Switch(
                ["--fmodel", "fmodel"],
                "Incorporate the AA/nuc composition information into "
                "the scoring matrix (True) or not (False, default)",
            ),
            # **** Output ****
            # Name length for CLUSTAL and PHYLIP format output
            _Option(
                ["--namelength", "namelength"],
                """Name length in CLUSTAL and PHYLIP output.

                    MAFFT v6.847 (2011) added --namelength for use with
                    the --clustalout option for CLUSTAL output.

                    MAFFT v7.024 (2013) added support for this with the
                    --phylipout option for PHYLIP output (default 10).
                    """,
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Output format: clustal format. Default: off (fasta format)
            _Switch(
                ["--clustalout", "clustalout"],
                "Output format: clustal (True) or fasta (False, default)",
            ),
            # Output format: phylip format.
            # Added in beta with v6.847, fixed in v6.850 (2011)
            _Switch(
                ["--phylipout", "phylipout"],
                "Output format: phylip (True), or fasta (False, default)",
            ),
            # Output order: same as input. Default: on
            _Switch(
                ["--inputorder", "inputorder"],
                "Output order: same as input (True, default) or alignment "
                "based (False)",
            ),
            # Output order: aligned. Default: off (inputorder)
            _Switch(
                ["--reorder", "reorder"],
                "Output order: aligned (True) or in input order (False, default)",
            ),
            # Guide tree is output to the input.tree file. Default: off
            _Switch(
                ["--treeout", "treeout"],
                "Guide tree is output to the input.tree file (True) or "
                "not (False, default)",
            ),
            # Do not report progress. Default: off
            _Switch(
                ["--quiet", "quiet"],
                "Do not report progress (True) or not (False, default).",
            ),
            # **** Input ****
            # Assume the sequences are nucleotide. Default: auto
            _Switch(
                ["--nuc", "nuc"],
                "Assume the sequences are nucleotide (True/False). Default: auto",
            ),
            # Assume the sequences are amino acid. Default: auto
            _Switch(
                ["--amino", "amino"],
                "Assume the sequences are amino acid (True/False). Default: auto",
            ),
            # MAFFT has multiple --seed commands where the unaligned input is
            # aligned to the seed alignment. There can be multiple seeds in the
            # form: "mafft --seed align1 --seed align2 [etc] input"
            # Effectively for n number of seed alignments.
            # TODO - Can we use class _ArgumentList here?
            _Option(
                ["--seed", "seed"],
                "Seed alignments given in alignment_n (fasta format) "
                "are aligned with sequences in input.",
                filename=True,
                equate=False,
            ),
            # The input (must be FASTA format)
            _Argument(["input"], "Input file name", filename=True, is_required=True),
            # mafft-profile takes a second alignment input as an argument:
            # mafft-profile align1 align2
            _Argument(
                ["input1"],
                "Second input file name for the mafft-profile command",
                filename=True,
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from Bio import _utils

    _utils.run_doctest()

View File

@ -1,686 +0,0 @@
# Copyright 2009 by Cymon J. Cox. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the multiple alignment program MUSCLE."""
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
class MuscleCommandline(AbstractCommandline):
    r"""Command line wrapper for the multiple alignment program MUSCLE.

    http://www.drive5.com/muscle/

    Notes
    -----
    Last checked against version: 3.7, briefly against 3.8

    References
    ----------
    Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high
    accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97.

    Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with
    reduced time and space complexity. BMC Bioinformatics 5(1): 113.

    Examples
    --------
    >>> from Bio.Align.Applications import MuscleCommandline
    >>> muscle_exe = r"C:\Program Files\Alignments\muscle3.8.31_i86win32.exe"
    >>> in_file = r"C:\My Documents\unaligned.fasta"
    >>> out_file = r"C:\My Documents\aligned.fasta"
    >>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file)
    >>> print(muscle_cline)
    "C:\Program Files\Alignments\muscle3.8.31_i86win32.exe" -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta"

    You would typically run the command line with muscle_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    """

    def __init__(self, cmd="muscle", **kwargs):
        """Initialize the class.

        Builds ``self.parameters`` (the MUSCLE options this wrapper knows
        about, in the order they are listed here) and then delegates to
        ``AbstractCommandline.__init__`` with ``cmd`` and the caller's
        keyword arguments.
        """
        # Allowed values used by the checker functions further down.
        CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"]
        DISTANCE_MEASURES_ITER1 = [
            "kmer6_6",
            "kmer20_3",
            "kmer20_4",
            "kbit20_3",
            "kmer4_6",
        ]
        # Iteration 2 accepts everything iteration 1 does plus two more.
        DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + [
            "pctid_kimura",
            "pctid_log",
        ]
        OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"]
        TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"]
        # The nucleotide arguments for the sequence type parameter in MUSCLE
        # (-seqtype) were updated at some point in MUSCLE version 3.8. Prior
        # to the update 'nucleo' was used for nucleotide. This has been
        # updated to 'rna' and 'dna'. 'nucleo' kept for backwards
        # compatibility with older MUSCLE versions.
        SEQUENCE_TYPES = ["protein", "rna", "dna", "nucleo", "auto"]
        WEIGHTING_SCHEMES = [
            "none",
            "clustalw",
            "henikoff",
            "henikoffpb",
            "gsc",
            "threeway",
        ]

        # The "name / type / default / meaning" comments below are condensed
        # from the option tables in the MUSCLE manual; "[1]" refers to a
        # footnote in that manual.
        self.parameters = [
            # Can't use "in" as the final alias as this
            # is a reserved word in python:
            _Option(
                ["-in", "in", "input"], "Input filename", filename=True, equate=False
            ),
            _Option(["-out", "out"], "Output filename", filename=True, equate=False),
            _Switch(
                ["-diags", "diags"], "Find diagonals (faster for similar sequences)"
            ),
            _Switch(["-profile", "profile"], "Perform a profile alignment"),
            _Option(
                ["-in1", "in1"],
                "First input filename for profile alignment",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-in2", "in2"],
                "Second input filename for a profile alignment",
                filename=True,
                equate=False,
            ),
            # anchorspacing / Integer / default 32
            # Minimum spacing between anchor cols
            _Option(
                ["-anchorspacing", "anchorspacing"],
                "Minimum spacing between anchor columns",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # center / Floating point / default [1]
            # Center parameter. Should be negative.
            _Option(
                ["-center", "center"],
                "Center parameter - should be negative",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # cluster1, cluster2 / upgma, upgmb, neighborjoining / default upgmb
            # Clustering method. cluster1 is used in iteration 1 and 2,
            # cluster2 in later iterations.
            _Option(
                ["-cluster1", "cluster1"],
                "Clustering method used in iteration 1",
                checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
                equate=False,
            ),
            _Option(
                ["-cluster2", "cluster2"],
                "Clustering method used in iteration 2",
                checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
                equate=False,
            ),
            # diaglength / Integer / default 24
            # Minimum length of diagonal.
            # Written as "-diaglength N" like every other valued option here
            # (this was the only one using the "-flag=value" form).
            _Option(
                ["-diaglength", "diaglength"],
                "Minimum length of diagonal",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # diagmargin / Integer / default 5
            # Discard this many positions at ends of diagonal.
            _Option(
                ["-diagmargin", "diagmargin"],
                "Discard this many positions at ends of diagonal",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # distance1 / kmer6_6, kmer20_3, kmer20_4, kbit20_3, kmer4_6
            # default Kmer6_6 (amino) or Kmer4_6 (nucleo)
            # Distance measure for iteration 1
            _Option(
                ["-distance1", "distance1"],
                "Distance measure for iteration 1",
                checker_function=lambda x: x in DISTANCE_MEASURES_ITER1,
                equate=False,
            ),
            # distance2 / kmer6_6, kmer20_3, kmer20_4, kbit20_3,
            # pctid_kimura, pctid_log / default pctid_kimura
            # Distance measure for iterations 2, 3 ...
            _Option(
                ["-distance2", "distance2"],
                "Distance measure for iteration 2",
                checker_function=lambda x: x in DISTANCE_MEASURES_ITER2,
                equate=False,
            ),
            # gapextend / Floating point / default [1]
            # The gap extend score
            _Option(
                ["-gapextend", "gapextend"],
                "Gap extension penalty",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # gapopen / Floating point / default [1]
            # The gap open score. Must be negative.
            _Option(
                ["-gapopen", "gapopen"],
                "Gap open score - negative number",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # hydro / Integer / default 5
            # Window size for determining whether a region is hydrophobic.
            _Option(
                ["-hydro", "hydro"],
                "Window size for hydrophobic region",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # hydrofactor / Floating point / default 1.2
            # Multiplier for gap open/close penalties in hydrophobic regions
            _Option(
                ["-hydrofactor", "hydrofactor"],
                "Multiplier for gap penalties in hydrophobic regions",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # log / File name / default None
            # Log file name (delete existing file).
            _Option(["-log", "log"], "Log file name", filename=True, equate=False),
            # loga / File name / default None
            # Log file name (append to existing file).
            _Option(
                ["-loga", "loga"],
                "Log file name (append to existing file)",
                filename=True,
                equate=False,
            ),
            # matrix / File name / default None
            # File name for substitution matrix in NCBI or WU-BLAST format.
            # If you specify your own matrix, you should also specify:
            #   -gapopen <g>
            #   -gapextend <e>
            #   -center 0.0
            _Option(
                ["-matrix", "matrix"],
                "path to NCBI or WU-BLAST format protein substitution "
                "matrix - also set -gapopen, -gapextend and -center",
                filename=True,
                equate=False,
            ),
            # diagbreak / Integer / default 1
            # Maximum distance between two diagonals that allows them to
            # merge into one diagonal.
            _Option(
                ["-diagbreak", "diagbreak"],
                "Maximum distance between two diagonals that allows "
                "them to merge into one diagonal",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            _Option(
                ["-maxdiagbreak", "maxdiagbreak"],  # deprecated 3.8
                "Deprecated in v3.8, use -diagbreak instead.",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # maxhours / Floating point / default None
            # Maximum time to run in hours. The actual time may exceed the
            # requested limit by a few minutes. Decimals are allowed, so 1.5
            # means one hour and 30 minutes.
            _Option(
                ["-maxhours", "maxhours"],
                "Maximum time to run in hours",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # maxiters / Integer 1, 2 ... / default 16
            # Maximum number of iterations.
            _Option(
                ["-maxiters", "maxiters"],
                "Maximum number of iterations",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # maxtrees / Integer / default 1
            # Maximum number of new trees to build in iteration 2.
            _Option(
                ["-maxtrees", "maxtrees"],
                "Maximum number of trees to build in iteration 2",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # minbestcolscore / Floating point / default [1]
            # Minimum score a column must have to be an anchor.
            _Option(
                ["-minbestcolscore", "minbestcolscore"],
                "Minimum score a column must have to be an anchor",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # minsmoothscore / Floating point / default [1]
            # Minimum smoothed score a column must have to be an anchor.
            _Option(
                ["-minsmoothscore", "minsmoothscore"],
                "Minimum smoothed score a column must have to be an anchor",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # objscore / sp, ps, dp, xp, spf, spm / default spm
            # Objective score used by tree dependent refinement.
            #   sp  = sum-of-pairs score.
            #   spf = sum-of-pairs score (dimer approximation).
            #   spm = sp for < 100 seqs, otherwise spf.
            #   dp  = dynamic programming score.
            #   ps  = average profile-sequence score.
            #   xp  = cross profile score.
            _Option(
                ["-objscore", "objscore"],
                "Objective score used by tree dependent refinement",
                checker_function=lambda x: x in OBJECTIVE_SCORES,
                equate=False,
            ),
            # refinewindow / Integer / default 200
            # Length of window for -refinew.
            _Option(
                ["-refinewindow", "refinewindow"],
                "Length of window for -refinew",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # root1, root2 / pseudo, midlongestspan, minavgleafdist
            # default pseudo
            # Method used to root tree; root1 is used in iteration 1 and 2,
            # root2 in later iterations.
            _Option(
                ["-root1", "root1"],
                "Method used to root tree in iteration 1",
                checker_function=lambda x: x in TREE_ROOT_METHODS,
                equate=False,
            ),
            _Option(
                ["-root2", "root2"],
                "Method used to root tree in iteration 2",
                checker_function=lambda x: x in TREE_ROOT_METHODS,
                equate=False,
            ),
            # scorefile / File name / default None
            # File name where to write a score file. This contains one line
            # for each column in the alignment. The line contains the letters
            # in the column followed by the average BLOSUM62 score over pairs
            # of letters in the column.
            _Option(
                ["-scorefile", "scorefile"],
                "Score file name, contains one line for each column"
                " in the alignment with average BLOSUM62 score",
                filename=True,
                equate=False,
            ),
            # seqtype / default auto / Sequence type.
            #   protein
            #   dna    (MUSCLE version > 3.8)
            #   rna    (MUSCLE version > 3.8)
            #   auto
            #   nucleo (only valid for MUSCLE versions < 3.8)
            _Option(
                ["-seqtype", "seqtype"],
                "Sequence type",
                checker_function=lambda x: x in SEQUENCE_TYPES,
                equate=False,
            ),
            # smoothscoreceil / Floating point / default [1]
            # Maximum value of column score for smoothing purposes.
            _Option(
                ["-smoothscoreceil", "smoothscoreceil"],
                "Maximum value of column score for smoothing",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # smoothwindow / Integer / default 7
            # Window used for anchor column smoothing.
            _Option(
                ["-smoothwindow", "smoothwindow"],
                "Window used for anchor column smoothing",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # spscore / File name
            # Compute SP objective score of multiple alignment.
            _Option(
                ["-spscore", "spscore"],
                "Compute SP objective score of multiple alignment",
                filename=True,
                equate=False,
            ),
            # SUEFF / Floating point value between 0 and 1 / default 0.1
            # Constant used in UPGMB clustering. Determines the relative
            # fraction of average linkage (SUEFF) vs. nearest neighbor
            # linkage (1 - SUEFF).
            _Option(
                ["-sueff", "sueff"],
                "Constant used in UPGMB clustering",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # tree1, tree2 / File name / default None
            # Save tree produced in first or second iteration to given file
            # in Newick (Phylip-compatible) format.
            # filename=True so that paths containing spaces are quoted, as
            # for the other file name options.
            _Option(
                ["-tree1", "tree1"],
                "Save Newick tree from iteration 1",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-tree2", "tree2"],
                "Save Newick tree from iteration 2",
                filename=True,
                equate=False,
            ),
            # usetree / File name / default None
            # Use given tree as guide tree. Must be in Newick
            # (Phylip-compatible) format.
            _Option(
                ["-usetree", "usetree"],
                "Use given Newick tree as guide tree",
                filename=True,
                equate=False,
            ),
            # weight1, weight2 / none, henikoff, henikoffpb, gsc, clustalw,
            # threeway / default clustalw
            # Sequence weighting scheme. weight1 is used in iterations 1 and
            # 2. weight2 is used for tree-dependent refinement.
            #   none       = all sequences have equal weight.
            #   henikoff   = Henikoff & Henikoff weighting scheme.
            #   henikoffpb = Modified Henikoff scheme as used in PSI-BLAST.
            #   clustalw   = CLUSTALW method.
            #   threeway   = Gotoh three-way method.
            _Option(
                ["-weight1", "weight1"],
                "Weighting scheme used in iteration 1",
                checker_function=lambda x: x in WEIGHTING_SCHEMES,
                equate=False,
            ),
            _Option(
                ["-weight2", "weight2"],
                "Weighting scheme used in iteration 2",
                checker_function=lambda x: x in WEIGHTING_SCHEMES,
                equate=False,
            ),
            # ################### FORMATS ####################################
            # Multiple formats can be specified on the command line
            # If -msf appears it will be used regardless of other formats
            # specified. If -clw appears (and not -msf), clustalw format will
            # be used regardless of other formats specified. If both -clw and
            # -clwstrict are specified -clwstrict will be used regardless of
            # other formats specified. If -fasta is specified and not -msf,
            # -clw, or clwstrict, fasta will be used. If -fasta and -html are
            # specified -fasta will be used. Only if -html is specified alone
            # will html be used. I kid ye not.
            #
            # clw / default no
            # Write output in CLUSTALW format (default is FASTA).
            _Switch(
                ["-clw", "clw"],
                "Write output in CLUSTALW format (with a MUSCLE header)",
            ),
            # clwstrict / default no
            # Write output in CLUSTALW format with the "CLUSTAL W (1.81)"
            # header rather than the MUSCLE version. This is useful when a
            # post-processing step is picky about the file header.
            _Switch(
                ["-clwstrict", "clwstrict"],
                "Write output in CLUSTALW format with version 1.81 header",
            ),
            # fasta / default yes
            # Write output in FASTA format. Alternatives include clw,
            # clwstrict, msf and html.
            _Switch(["-fasta", "fasta"], "Write output in FASTA format"),
            # html / default no
            # Write output in HTML format (default is FASTA).
            _Switch(["-html", "html"], "Write output in HTML format"),
            # msf / default no
            # Write output in MSF format (default is FASTA).
            _Switch(["-msf", "msf"], "Write output in MSF format"),
            # Phylip interleaved - undocumented as of 3.7
            _Switch(["-phyi", "phyi"], "Write output in PHYLIP interleaved format"),
            # Phylip sequential - undocumented as of 3.7
            _Switch(["-phys", "phys"], "Write output in PHYLIP sequential format"),
            # ################# Additional specified output files #########
            _Option(
                ["-phyiout", "phyiout"],
                "Write PHYLIP interleaved output to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-physout", "physout"],
                "Write PHYLIP sequential format to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-htmlout", "htmlout"],
                "Write HTML output to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-clwout", "clwout"],
                "Write CLUSTALW output (with MUSCLE header) to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-clwstrictout", "clwstrictout"],
                "Write CLUSTALW output (with version 1.81 header) to "
                "specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-msfout", "msfout"],
                "Write MSF format output to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-fastaout", "fastaout"],
                "Write FASTA format output to specified filename",
                filename=True,
                equate=False,
            ),
            # ############# END FORMATS ###################################
            # anchors / default yes
            # Use anchor optimization in tree dependent refinement iterations.
            _Switch(
                ["-anchors", "anchors"],
                "Use anchor optimisation in tree dependent refinement iterations",
            ),
            # noanchors / default no
            # Disable anchor optimization. Default is anchors.
            _Switch(
                ["-noanchors", "noanchors"],
                "Do not use anchor optimisation in tree dependent "
                "refinement iterations",
            ),
            # brenner / default no
            # Use Steven Brenner's method for computing the root alignment.
            _Switch(
                ["-brenner", "brenner"], "Use Steve Brenner's root alignment method"
            ),
            # cluster / default no
            # Perform fast clustering of input sequences. Use the tree1
            # option to save the tree.
            _Switch(
                ["-cluster", "cluster"],
                "Perform fast clustering of input sequences, "
                "use -tree1 to save tree",
            ),
            # dimer / default no
            # Use dimer approximation for the SP score (faster, less accurate).
            _Switch(
                ["-dimer", "dimer"],
                "Use faster (slightly less accurate) dimer approximation "
                "for the SP score",
            ),
            # group / default yes
            # Group similar sequences together in the output. This is the
            # default. See also stable.
            _Switch(["-group", "group"], "Group similar sequences in output"),
            # ############# log-expectation profile score ####################
            # One of either -le, -sp, or -sv
            #
            # According to the doc, spn is default and the only option for
            # nucleotides: this doesn't appear to be true. -le, -sp, and -sv
            # can be used and produce numerically different logs
            # (what is going on?)
            #
            # spn fails on proteins
            #
            # le / default maybe
            # Use log-expectation profile score (VTML240). Alternatives are
            # to use sp or sv. This is the default for amino acid sequences.
            _Switch(["-le", "le"], "Use log-expectation profile score (VTML240)"),
            # sv / default no
            # Use sum-of-pairs profile score (VTML240). Default is le.
            _Switch(["-sv", "sv"], "Use sum-of-pairs profile score (VTML240)"),
            # sp / default no
            # Use sum-of-pairs protein profile score (PAM200). Default is le.
            _Switch(["-sp", "sp"], "Use sum-of-pairs protein profile score (PAM200)"),
            # spn / default maybe
            # Use sum-of-pairs nucleotide profile score (BLASTZ parameters).
            # This is the only option for nucleotides, and is therefore the
            # default.
            _Switch(["-spn", "spn"], "Use sum-of-pairs nucleotide profile score"),
            # ########## END log-expectation profile score ###################
            # quiet / default no
            # Do not display progress messages.
            _Switch(["-quiet", "quiet"], "Do not display progress messages"),
            # refine / default no
            # Input file is already aligned, skip first two iterations and
            # begin tree dependent refinement.
            _Switch(["-refine", "refine"], "Only do tree dependent refinement"),
            # refinew / default no
            # Refine an alignment by dividing it into non-overlapping windows
            # and re-aligning each window. Typically used for whole-genome
            # nucleotide alignments.
            _Switch(
                ["-refinew", "refinew"],
                "Only do tree dependent refinement using sliding window approach",
            ),
            # core / default yes in muscle, no in muscled.
            # Do not catch exceptions.
            _Switch(["-core", "core"], "Do not catch exceptions"),
            # nocore / default no in muscle, yes in muscled.
            # Catch exceptions and give an error message if possible.
            _Switch(["-nocore", "nocore"], "Catch exceptions"),
            # stable / default no
            # Preserve input order of sequences in output file. Default is
            # to group sequences by similarity (group).
            _Switch(
                ["-stable", "stable"],
                "Do not group similar sequences in output (not supported in v3.8)",
            ),
            # The following MUSCLE options are documented in the manual but
            # are not exposed by this wrapper:
            #
            # termgaps4 / default yes
            # Use 4-way test for treatment of terminal gaps.
            # (Cannot be disabled in this version).
            #
            # termgapsfull / default no
            # Terminal gaps penalized with full penalty. [1] Not fully
            # supported in this version
            #
            # termgapshalf / default yes
            # Terminal gaps penalized with half penalty. [1] Not fully
            # supported in this version
            #
            # termgapshalflonger / default no
            # Terminal gaps penalized with half penalty if gap relative to
            # longer sequence, otherwise with full penalty. [1] Not fully
            # supported in this version
            #
            # verbose / default no
            # Write parameter settings and progress messages to log file.
            _Switch(["-verbose", "verbose"], "Write parameter settings and progress"),
            # version / default no
            # Write version string to stdout and exit
            _Switch(["-version", "version"], "Write version string to stdout and exit"),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from Bio import _utils

    _utils.run_doctest()

View File

@ -1,238 +0,0 @@
# Copyright 2009 by Cymon J. Cox. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the multiple alignment program PRANK."""
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
class PrankCommandline(AbstractCommandline):
    """Command line wrapper for the multiple alignment program PRANK.

    http://www.ebi.ac.uk/goldman-srv/prank/prank/

    Notes
    -----
    Last checked against version: 081202

    Numeric options are validated by type: the gap/distance/branch options
    require a float, ``kappa`` and ``rho`` accept an int or a float, and
    ``dnafreqs`` must be a text string such as ``"25,25,25,25"``.

    References
    ----------
    Loytynoja, A. and Goldman, N. 2005. An algorithm for progressive
    multiple alignment of sequences with insertions. Proceedings of
    the National Academy of Sciences, 102: 10557--10562.

    Loytynoja, A. and Goldman, N. 2008. Phylogeny-aware gap placement
    prevents errors in sequence alignment and evolutionary analysis.
    Science, 320: 1632.

    Examples
    --------
    To align a FASTA file (unaligned.fasta) with the output in aligned
    FASTA format with the output filename starting with "aligned" (you
    can't pick the filename explicitly), no tree output and no XML output,
    use:

    >>> from Bio.Align.Applications import PrankCommandline
    >>> prank_cline = PrankCommandline(d="unaligned.fasta",
    ...                                o="aligned", # prefix only!
    ...                                f=8, # FASTA output
    ...                                notree=True, noxml=True)
    >>> print(prank_cline)
    prank -d=unaligned.fasta -o=aligned -f=8 -noxml -notree

    You would typically run the command line with prank_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="prank", **kwargs):
        """Initialize the class.

        ``cmd`` is the executable name or path; every keyword argument must
        be the final name of one of the parameters declared below.
        """
        # Any integer format code from 1 to 17 passes validation, including
        # codes the description text below does not list.
        OUTPUT_FORMAT_VALUES = list(range(1, 18))
        self.parameters = [
            # ################# input/output parameters: ##################
            # -d=sequence_file
            _Option(["-d", "d"], "Input filename", filename=True, is_required=True),
            # -t=tree_file [default: no tree, generate approximate NJ tree]
            _Option(["-t", "t"], "Input guide tree filename", filename=True),
            # -tree="tree_string" [tree in newick format; in double quotes]
            _Option(["-tree", "tree"], "Input guide tree as Newick string"),
            # -m=model_file [default: HKY2/WAG]
            _Option(
                ["-m", "m"], "User-defined alignment model filename. Default: HKY2/WAG"
            ),
            # -o=output_file [default: 'output']
            _Option(
                ["-o", "o"],
                "Output filenames prefix. Default: 'output'\n "
                "Will write: output.?.fas (depending on requested "
                "format), output.?.xml and output.?.dnd",
                filename=True,
            ),
            # -f=output_format [default: 8]
            _Option(
                ["-f", "f"],
                "Output alignment format. Default: 8 FASTA\n"
                "Option are:\n"
                "1. IG/Stanford 8. Pearson/Fasta\n"
                "2. GenBank/GB 11. Phylip3.2\n"
                "3. NBRF 12. Phylip\n"
                "4. EMBL 14. PIR/CODATA\n"
                "6. DNAStrider 15. MSF\n"
                "7. Fitch 17. PAUP/NEXUS",
                checker_function=lambda x: x in OUTPUT_FORMAT_VALUES,
            ),
            _Switch(
                ["-noxml", "noxml"],
                "Do not output XML files (PRANK versions earlier than v.120626)",
            ),
            _Switch(
                ["-notree", "notree"],
                "Do not output dnd tree files (PRANK versions earlier than v.120626)",
            ),
            _Switch(
                ["-showxml", "showxml"], "Output XML files (PRANK v.120626 and later)"
            ),
            _Switch(
                ["-showtree", "showtree"],
                "Output dnd tree files (PRANK v.120626 and later)",
            ),
            _Switch(["-shortnames", "shortnames"], "Truncate names at first space"),
            _Switch(["-quiet", "quiet"], "Reduce verbosity"),
            # ###################### model parameters: ######################
            # +F [force insertions to be always skipped]
            # -F [equivalent]
            _Switch(
                ["-F", "+F", "F"], "Force insertions to be always skipped: same as +F"
            ),
            # -dots [show insertion gaps as dots]
            _Switch(["-dots", "dots"], "Show insertion gaps as dots"),
            # -gaprate=# [gap opening rate; default: dna 0.025 / prot 0.0025]
            _Option(
                ["-gaprate", "gaprate"],
                "Gap opening rate. Default: dna 0.025 prot 0.0025",
                checker_function=lambda x: isinstance(x, float),
            ),
            # -gapext=# [gap extension probability; default: dna 0.5 / prot 0.5]
            _Option(
                ["-gapext", "gapext"],
                "Gap extension probability. Default: dna 0.5 / prot 0.5",
                checker_function=lambda x: isinstance(x, float),
            ),
            # -dnafreqs=#,#,#,# [ACGT; default: empirical]
            _Option(
                ["-dnafreqs", "dnafreqs"],
                "DNA frequencies - 'A,C,G,T'. eg '25,25,25,25' as a quote "
                "surrounded string value. Default: empirical",
                # The value is documented as a string; on Python 3 that is
                # str. The previous bytes check rejected every ordinary
                # string, making the option unusable as documented.
                checker_function=lambda x: isinstance(x, str),
            ),
            # -kappa=# [ts/tv rate ratio; default:2]
            _Option(
                ["-kappa", "kappa"],
                "Transition/transversion ratio. Default: 2",
                # A ratio need not be a whole number, so floats are accepted
                # in addition to the previously allowed ints.
                checker_function=lambda x: isinstance(x, (int, float)),
            ),
            # -rho=# [pur/pyr rate ratio; default:1]
            _Option(
                ["-rho", "rho"],
                "Purine/pyrimidine ratio. Default: 1",
                # As for kappa: accept fractional ratios too.
                checker_function=lambda x: isinstance(x, (int, float)),
            ),
            # -codon [for DNA: use empirical codon model]
            _Switch(["-codon", "codon"], "Codon aware alignment or not"),
            # -termgap [penalise terminal gaps normally]
            _Switch(["-termgap", "termgap"], "Penalise terminal gaps normally"),
            # ############### other parameters: ################################
            # -nopost [do not compute posterior support; default: compute]
            _Switch(
                ["-nopost", "nopost"],
                "Do not compute posterior support. Default: compute",
            ),
            # -pwdist=# [expected pairwise distance for computing guidetree;
            # default: dna 0.25 / prot 0.5]
            _Option(
                ["-pwdist", "pwdist"],
                "Expected pairwise distance for computing guidetree. "
                "Default: dna 0.25 / prot 0.5",
                checker_function=lambda x: isinstance(x, float),
            ),
            _Switch(
                ["-once", "once"], "Run only once. Default: twice if no guidetree given"
            ),
            _Switch(["-twice", "twice"], "Always run twice"),
            _Switch(["-skipins", "skipins"], "Skip insertions in posterior support"),
            _Switch(
                ["-uselogs", "uselogs"],
                "Slower but should work for a greater number of sequences",
            ),
            _Switch(["-writeanc", "writeanc"], "Output ancestral sequences"),
            _Switch(
                ["-printnodes", "printnodes"], "Output each node; mostly for debugging"
            ),
            # -matresize=# [matrix resizing multiplier]
            # Doesn't specify type but Float and Int work
            _Option(
                ["-matresize", "matresize"],
                "Matrix resizing multiplier",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            # -matinitsize=# [matrix initial size multiplier]
            # Doesn't specify type but Float and Int work
            _Option(
                ["-matinitsize", "matinitsize"],
                "Matrix initial size multiplier",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            _Switch(["-longseq", "longseq"], "Save space in pairwise alignments"),
            _Switch(["-pwgenomic", "pwgenomic"], "Do pairwise alignment, no guidetree"),
            # -pwgenomicdist=# [distance for pairwise alignment; default: 0.3]
            _Option(
                ["-pwgenomicdist", "pwgenomicdist"],
                "Distance for pairwise alignment. Default: 0.3",
                checker_function=lambda x: isinstance(x, float),
            ),
            # -scalebranches=# [scale branch lengths; default: dna 1 / prot 2]
            _Option(
                ["-scalebranches", "scalebranches"],
                "Scale branch lengths. Default: dna 1 / prot 2",
                checker_function=lambda x: isinstance(x, int),
            ),
            # -fixedbranches=# [use fixed branch lengths]
            # Assume looking for a float
            _Option(
                ["-fixedbranches", "fixedbranches"],
                "Use fixed branch lengths of input value",
                checker_function=lambda x: isinstance(x, float),
            ),
            # -maxbranches=# [set maximum branch length]
            # Assume looking for a float
            _Option(
                ["-maxbranches", "maxbranches"],
                "Use maximum branch lengths of input value",
                checker_function=lambda x: isinstance(x, float),
            ),
            # -realbranches [disable branch length truncation]
            _Switch(
                ["-realbranches", "realbranches"], "Disable branch length truncation"
            ),
            _Switch(["-translate", "translate"], "Translate to protein"),
            _Switch(
                ["-mttranslate", "mttranslate"], "Translate to protein using mt table"
            ),
            # ##################### other: ####################
            _Switch(
                ["-convert", "convert"],
                "Convert input alignment to new format. Do not perform alignment",
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
# Script entry point: only when this file is executed directly (not imported),
# call Bio._utils.run_doctest -- presumably this runs the doctest examples
# embedded in the module's docstrings; confirm against Bio._utils.
if __name__ == "__main__":
    from Bio._utils import run_doctest
    run_doctest()

View File

@ -1,140 +0,0 @@
# Copyright 2009 by Cymon J. Cox. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the multiple alignment program PROBCONS."""
from Bio.Application import _Argument
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
class ProbconsCommandline(AbstractCommandline):
    """Command line wrapper for the multiple alignment program PROBCONS.

    http://probcons.stanford.edu/

    Notes
    -----
    Last checked against version: 1.12

    References
    ----------
    Do, C.B., Mahabhashyam, M.S.P., Brudno, M., and Batzoglou, S. 2005.
    PROBCONS: Probabilistic Consistency-based Multiple Sequence Alignment.
    Genome Research 15: 330-340.

    Examples
    --------
    To align a FASTA file (unaligned.fasta) with the output in ClustalW
    format, and otherwise default settings, use:

    >>> from Bio.Align.Applications import ProbconsCommandline
    >>> probcons_cline = ProbconsCommandline(input="unaligned.fasta",
    ...                                      clustalw=True)
    >>> print(probcons_cline)
    probcons -clustalw unaligned.fasta

    You would typically run the command line with probcons_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    Note that PROBCONS will write the alignment to stdout, which you may
    want to save to a file and then parse, e.g.::

        stdout, stderr = probcons_cline()
        with open("aligned.aln", "w") as handle:
            handle.write(stdout)
        from Bio import AlignIO
        align = AlignIO.read("aligned.aln", "clustalw")

    Alternatively, to parse the output with AlignIO directly you can
    use StringIO to turn the string into a handle::

        stdout, stderr = probcons_cline()
        from io import StringIO
        from Bio import AlignIO
        align = AlignIO.read(StringIO(stdout), "clustalw")

    """

    def __init__(self, cmd="probcons", **kwargs):
        """Initialize the class.

        ``cmd`` is the executable name or path; every keyword argument must
        be the final name of one of the parameters declared below.
        """
        self.parameters = [
            # Note that some options cannot be assigned via properties using the
            # original documented option (because hyphens are not valid for names in
            # python), e.g cmdline.pre-training = 3 will not work
            # In these cases the shortened option name should be used
            # cmdline.pre = 3
            _Switch(
                ["-clustalw", "clustalw"], "Use CLUSTALW output format instead of MFA"
            ),
            _Option(
                ["-c", "c", "--consistency", "consistency"],
                "Use 0 <= REPS <= 5 (default: 2) passes of consistency transformation",
                checker_function=lambda x: x in range(6),
                equate=False,
            ),
            _Option(
                ["-ir", "--iterative-refinement", "iterative-refinement", "ir"],
                "Use 0 <= REPS <= 1000 (default: 100) passes of iterative-refinement",
                checker_function=lambda x: x in range(1001),
                equate=False,
            ),
            _Option(
                ["-pre", "--pre-training", "pre-training", "pre"],
                "Use 0 <= REPS <= 20 (default: 0) rounds of pretraining",
                checker_function=lambda x: x in range(21),
                equate=False,
            ),
            _Switch(["-pairs", "pairs"], "Generate all-pairs pairwise alignments"),
            _Switch(
                ["-viterbi", "viterbi"],
                "Use Viterbi algorithm to generate all pairs "
                "(automatically enables -pairs)",
            ),
            _Switch(
                ["-verbose", "verbose"], "Report progress while aligning (default: off)"
            ),
            # The next three options each take a FILENAME, so they are flagged
            # as filenames like the file options of the sibling wrappers.
            _Option(
                ["-annot", "annot"],
                "Write annotation for multiple alignment to FILENAME",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-t", "t", "--train", "train"],
                "Compute EM transition probabilities, store in FILENAME "
                "(default: no training)",
                filename=True,
                equate=False,
            ),
            _Switch(
                ["-e", "e", "--emissions", "emissions"],
                "Also reestimate emission probabilities (default: off)",
            ),
            _Option(
                ["-p", "p", "--paramfile", "paramfile"],
                "Read parameters from FILENAME",
                filename=True,
                equate=False,
            ),
            _Switch(
                ["-a", "--alignment-order", "alignment-order", "a"],
                "Print sequences in alignment order rather than input "
                "order (default: off)",
            ),
            # Input file name
            _Argument(
                ["input"],
                "Input file name. Must be multiple FASTA alignment (MFA) format",
                filename=True,
                is_required=True,
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
# Script entry point: only when this file is executed directly (not imported),
# call Bio._utils.run_doctest -- presumably this runs the doctest examples
# embedded in the module's docstrings; confirm against Bio._utils.
if __name__ == "__main__":
    from Bio._utils import run_doctest
    run_doctest()

View File

@ -1,126 +0,0 @@
# Copyright 2009 by Cymon J. Cox and Brad Chapman. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the multiple alignment program TCOFFEE."""
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
class TCoffeeCommandline(AbstractCommandline):
    """Commandline object for the TCoffee alignment program.

    http://www.tcoffee.org/Projects_home_page/t_coffee_home_page.html

    The T-Coffee command line tool has a lot of switches and options.
    This wrapper implements a VERY limited number of options - if you
    would like to help improve it please get in touch.

    Notes
    -----
    Last checked against: Version_6.92

    References
    ----------
    T-Coffee: A novel method for multiple sequence alignments.
    Notredame, Higgins, Heringa, JMB,302(205-217) 2000

    Examples
    --------
    To align a FASTA file (unaligned.fasta) with the output in ClustalW
    format (file aligned.aln), and otherwise default settings, use:

    >>> from Bio.Align.Applications import TCoffeeCommandline
    >>> tcoffee_cline = TCoffeeCommandline(infile="unaligned.fasta",
    ...                                    output="clustalw",
    ...                                    outfile="aligned.aln")
    >>> print(tcoffee_cline)
    t_coffee -output clustalw -infile unaligned.fasta -outfile aligned.aln

    You would typically run the command line with tcoffee_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.
    """

    # Values accepted by the ``type`` option's checker below.
    SEQ_TYPES = ["dna", "protein", "dna_protein"]

    def __init__(self, cmd="t_coffee", **kwargs):
        """Initialize the class.

        ``cmd`` is the executable name or path; every keyword argument must
        be the final name of one of the parameters declared below.
        """
        self.parameters = [
            _Option(
                ["-output", "output"],
                """Specify the output type.
One (or more separated by a comma) of:
'clustalw_aln', 'clustalw', 'gcg', 'msf_aln',
'pir_aln', 'fasta_aln', 'phylip', 'pir_seq', 'fasta_seq'
""",
                equate=False,
            ),
            _Option(
                ["-infile", "infile"],
                "Specify the input file.",
                filename=True,
                is_required=True,
                equate=False,
            ),
            # Indicates the name of the alignment output by t_coffee. If the
            # default is used, the alignment is named <your sequences>.aln
            _Option(
                ["-outfile", "outfile"],
                "Specify the output file. Default: <your sequences>.aln",
                filename=True,
                equate=False,
            ),
            _Switch(
                ["-convert", "convert"], "Specify you want to perform a file conversion"
            ),
            _Option(
                ["-type", "type"],
                "Specify the type of sequence being aligned",
                checker_function=lambda x: x in self.SEQ_TYPES,
                equate=False,
            ),
            _Option(
                ["-outorder", "outorder"],
                # The adjacent literals are concatenated, so the first one
                # needs its own sentence break (it used to read "outputEither").
                "Specify the order of sequence to output. "
                "Either 'input', 'aligned' or <filename> of "
                "Fasta file with sequence order",
                equate=False,
            ),
            _Option(
                ["-matrix", "matrix"],
                "Specify the filename of the substitution matrix to use. "
                "Default: blosum62mt",
                equate=False,
            ),
            _Option(
                ["-gapopen", "gapopen"],
                "Indicates the penalty applied for opening a gap (negative integer)",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            _Option(
                ["-gapext", "gapext"],
                "Indicates the penalty applied for extending a gap (negative integer)",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            _Switch(["-quiet", "quiet"], "Turn off log output"),
            _Option(
                ["-mode", "mode"],
                "Specifies a special mode: genome, quickaln, dali, 3dcoffee",
                equate=False,
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
# Script entry point: only when this file is executed directly (not imported),
# call Bio._utils.run_doctest -- presumably this runs the doctest examples
# embedded in the module's docstrings; confirm against Bio._utils.
if __name__ == "__main__":
    from Bio._utils import run_doctest
    run_doctest()

View File

@ -1,34 +0,0 @@
# Copyright 2009 by Peter Cock & Cymon J. Cox. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Alignment command line tool wrappers (OBSOLETE).
We have decided to remove this module in future, and instead recommend
building your command and invoking it via the subprocess module directly.
"""
from ._ClustalOmega import ClustalOmegaCommandline
from ._Clustalw import ClustalwCommandline
from ._Dialign import DialignCommandline
from ._Mafft import MafftCommandline
from ._MSAProbs import MSAProbsCommandline
from ._Muscle import MuscleCommandline
from ._Prank import PrankCommandline
from ._Probcons import ProbconsCommandline
from ._TCoffee import TCoffeeCommandline
# Declare the public API explicitly: these are exactly the nine wrapper
# classes imported above, and listing them makes them show up in the API docs.
__all__ = (
    "MuscleCommandline",
    "ClustalwCommandline",
    "ClustalOmegaCommandline",
    "PrankCommandline",
    "MafftCommandline",
    "DialignCommandline",
    "ProbconsCommandline",
    "TCoffeeCommandline",
    "MSAProbsCommandline",
)

View File

@ -1,855 +0,0 @@
# Copyright 2001-2004 Brad Chapman.
# Revisions copyright 2009-2013 by Peter Cock.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""General mechanisms to access applications in Biopython (DEPRECATED).
This module is not intended for direct use. It provides the basic objects which
are subclassed by our command line wrappers, such as:
- Bio.Align.Applications
- Bio.Blast.Applications
- Bio.Emboss.Applications
- Bio.Sequencing.Applications
These modules provide wrapper classes for command line tools to help you
construct command line strings by setting the values of each parameter.
The finished command line strings are then normally invoked via the built-in
Python module subprocess.
Due to the ongoing maintenance burden of keeping command line application
wrappers up to date, we have decided to deprecate and eventually remove them.
We instead now recommend building your command line and invoking it directly
with the subprocess module.
"""
import os
import platform
import re
import subprocess
import sys
import warnings
from Bio import BiopythonDeprecationWarning
# Module-level statement, so it executes when this module is first imported:
# issue a BiopythonDeprecationWarning explaining that the wrappers are
# deprecated and that subprocess should be used directly instead.
warnings.warn(
    """\
The Bio.Application modules and modules relying on it have been deprecated.
Due to the on going maintenance burden of keeping command line application
wrappers up to date, we have decided to deprecate and eventually remove these
modules.
We instead now recommend building your command line and invoking it directly
with the subprocess module.""",
    BiopythonDeprecationWarning,
)
# Names used as Python properties or keyword arguments must start with an
# ASCII letter and continue with letters, digits or underscores only; this
# pattern is what the wrapper machinery uses to vet them.
_re_prop_name = re.compile(r"^[a-zA-Z][a-zA-Z0-9_]*$")
# Import-time sanity checks: every one of these must be accepted ...
assert all(
    _re_prop_name.match(valid)
    for valid in ("t", "test", "underscore_ok", "test_name", "test2")
)
# ... while private-looking names (leading underscore) and anything with a
# hyphen must be rejected.
assert not any(
    _re_prop_name.match(invalid) for invalid in ("_test", "-test", "any-hyphen")
)
# Words reserved by the Python language itself; a parameter's final name may
# not be any of these. Kept as a list, in the original order.
_reserved_names = (
    "and del from not while "
    "as elif global or with "
    "assert else if pass yield "
    "break except import print "
    "class exec in raise "
    "continue finally is return "
    "def for lambda try"
).split()
# Names that would collide with the wrapper machinery's own attributes.
_local_reserved_names = ["set_parameter"]
class ApplicationError(subprocess.CalledProcessError):
    """Raised when an application returns a non-zero exit status (OBSOLETE).

    The exit status will be stored in the returncode attribute, similarly
    the command line string used in the cmd attribute, and (if captured)
    stdout and stderr as strings.

    This exception is a subclass of subprocess.CalledProcessError.

    >>> err = ApplicationError(-11, "helloworld", "", "Some error text")
    >>> err.returncode, err.cmd, err.stdout, err.stderr
    (-11, 'helloworld', '', 'Some error text')
    >>> print(err)
    Non-zero return code -11 from 'helloworld', message 'Some error text'
    >>> err
    ApplicationError(-11, 'helloworld', '', 'Some error text')

    """

    def __init__(self, returncode, cmd, stdout="", stderr=""):
        """Initialize the class.

        Stores the exit status, the command line string and the captured
        stdout/stderr (empty strings by default) as attributes.
        """
        self.returncode = returncode
        self.cmd = cmd
        self.stdout = stdout
        self.stderr = stderr

    def __str__(self):
        """Format the error as a string.

        The message includes the first line of stderr when one can be
        extracted, and only the return code and command otherwise.
        """
        # get first line of any stderr message
        try:
            msg = self.stderr.lstrip().split("\n", 1)[0].rstrip()
        except Exception:
            # Deliberately best-effort: stderr may be None, bytes or some
            # other non-string, and str() on an exception must not raise.
            msg = ""
        if msg:
            return "Non-zero return code %d from %r, message %r" % (
                self.returncode,
                self.cmd,
                msg,
            )
        return "Non-zero return code %d from %r" % (self.returncode, self.cmd)

    def __repr__(self):
        """Represent the error as a string.

        Uses %r for the string fields so that empty or multi-word values
        stay unambiguous and the result mirrors the constructor call.
        """
        return "ApplicationError(%i, %r, %r, %r)" % (
            self.returncode,
            self.cmd,
            self.stdout,
            self.stderr,
        )
class AbstractCommandline:
r"""Generic interface for constructing command line strings (OBSOLETE).
This class shouldn't be called directly; it should be subclassed to
provide an implementation for a specific application.
For a usage example we'll show one of the EMBOSS wrappers. You can set
options when creating the wrapper object using keyword arguments - or
later using their corresponding properties:
>>> from Bio.Emboss.Applications import WaterCommandline
>>> cline = WaterCommandline(gapopen=10, gapextend=0.5)
>>> cline
WaterCommandline(cmd='water', gapopen=10, gapextend=0.5)
You can instead manipulate the parameters via their properties, e.g.
>>> cline.gapopen
10
>>> cline.gapopen = 20
>>> cline
WaterCommandline(cmd='water', gapopen=20, gapextend=0.5)
You can clear a parameter you have already added by 'deleting' the
corresponding property:
>>> del cline.gapopen
>>> cline.gapopen
>>> cline
WaterCommandline(cmd='water', gapextend=0.5)
Once you have set the parameters you need, you can turn the object into
a string (e.g. to log the command):
>>> str(cline)
Traceback (most recent call last):
...
ValueError: You must either set outfile (output filename), or enable filter or stdout (output to stdout).
In this case the wrapper knows certain arguments are required to construct
a valid command line for the tool. For a complete example,
>>> from Bio.Emboss.Applications import WaterCommandline
>>> water_cmd = WaterCommandline(gapopen=10, gapextend=0.5)
>>> water_cmd.asequence = "asis:ACCCGGGCGCGGT"
>>> water_cmd.bsequence = "asis:ACCCGAGCGCGGT"
>>> water_cmd.outfile = "temp_water.txt"
>>> print(water_cmd)
water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5
>>> water_cmd
WaterCommandline(cmd='water', outfile='temp_water.txt', asequence='asis:ACCCGGGCGCGGT', bsequence='asis:ACCCGAGCGCGGT', gapopen=10, gapextend=0.5)
You would typically run the command line via a standard Python operating
system call using the subprocess module for full control. For the simple
case where you just want to run the command and get the output:
stdout, stderr = water_cmd()
Note that by default we assume the underlying tool is installed on the
system $PATH environment variable. This is normal under Linux/Unix, but
may need to be done manually under Windows. Alternatively, you can specify
the full path to the binary as the first argument (cmd):
>>> from Bio.Emboss.Applications import WaterCommandline
>>> water_cmd = WaterCommandline(r"C:\Program Files\EMBOSS\water.exe",
... gapopen=10, gapextend=0.5,
... asequence="asis:ACCCGGGCGCGGT",
... bsequence="asis:ACCCGAGCGCGGT",
... outfile="temp_water.txt")
>>> print(water_cmd)
"C:\Program Files\EMBOSS\water.exe" -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5
Notice that since the path name includes a space it has automatically
been quoted.
"""
# TODO - Replace the above example since EMBOSS doesn't work properly
# if installed into a folder with a space like "C:\Program Files\EMBOSS"
#
# Note the call example above is not a doctest as we can't handle EMBOSS
# (or any other tool) being missing in the unit tests.
parameters = None # will be a list defined in subclasses
def __init__(self, cmd, **kwargs):
"""Create a new instance of a command line wrapper object."""
# Init method - should be subclassed!
#
# The subclass methods should look like this:
#
# def __init__(self, cmd="muscle", **kwargs):
# self.parameters = [...]
# AbstractCommandline.__init__(self, cmd, **kwargs)
#
# i.e. There should have an optional argument "cmd" to set the location
# of the executable (with a sensible default which should work if the
# command is on the path on Unix), and keyword arguments. It should
# then define a list of parameters, all objects derived from the base
# class _AbstractParameter.
#
# The keyword arguments should be any valid parameter name, and will
# be used to set the associated parameter.
self.program_name = cmd
try:
parameters = self.parameters
except AttributeError:
raise AttributeError(
"Subclass should have defined self.parameters"
) from None
# Create properties for each parameter at run time
aliases = set()
for p in parameters:
if not p.names:
if not isinstance(p, _StaticArgument):
raise TypeError(f"Expected {p!r} to be of type _StaticArgument")
continue
for name in p.names:
if name in aliases:
raise ValueError(f"Parameter alias {name} multiply defined")
aliases.add(name)
name = p.names[-1]
if _re_prop_name.match(name) is None:
raise ValueError(
"Final parameter name %r cannot be used as "
"an argument or property name in python" % name
)
if name in _reserved_names:
raise ValueError(
"Final parameter name %r cannot be used as "
"an argument or property name because it is "
"a reserved word in python" % name
)
if name in _local_reserved_names:
raise ValueError(
"Final parameter name %r cannot be used as "
"an argument or property name due to the "
"way the AbstractCommandline class works" % name
)
# Beware of binding-versus-assignment confusion issues
def getter(name):
return lambda x: x._get_parameter(name)
def setter(name):
return lambda x, value: x.set_parameter(name, value)
def deleter(name):
return lambda x: x._clear_parameter(name)
doc = p.description
if isinstance(p, _Switch):
doc += (
"\n\nThis property controls the addition of the %s "
"switch, treat this property as a boolean." % p.names[0]
)
else:
doc += (
"\n\nThis controls the addition of the %s parameter "
"and its associated value. Set this property to the "
"argument value required." % p.names[0]
)
prop = property(getter(name), setter(name), deleter(name), doc)
setattr(self.__class__, name, prop) # magic!
for key, value in kwargs.items():
self.set_parameter(key, value)
def _validate(self):
"""Make sure the required parameters have been set (PRIVATE).
No return value - it either works or raises a ValueError.
This is a separate method (called from __str__) so that subclasses may
override it.
"""
for p in self.parameters:
# Check for missing required parameters:
if p.is_required and not (p.is_set):
raise ValueError(f"Parameter {p.names[-1]} is not set.")
# Also repeat the parameter validation here, just in case?
def __str__(self):
    """Build the command line string from the options currently set.

    Validation runs first, so a ValueError is raised if a required
    parameter is missing.

    e.g.

    >>> from Bio.Emboss.Applications import WaterCommandline
    >>> cline = WaterCommandline(gapopen=10, gapextend=0.5)
    >>> cline.asequence = "asis:ACCCGGGCGCGGT"
    >>> cline.bsequence = "asis:ACCCGAGCGCGGT"
    >>> cline.outfile = "temp_water.txt"
    >>> print(cline)
    water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5
    >>> str(cline)
    'water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5'

    """
    self._validate()
    pieces = [f"{_escape_filename(self.program_name)} "]
    # Each parameter's string form is expected to end with a space
    # (per the original comment), hence the plain join and final strip.
    pieces.extend(str(param) for param in self.parameters if param.is_set)
    return "".join(pieces).strip()  # remove trailing space
def __repr__(self):
"""Return a representation of the command line object for debugging.
e.g.
>>> from Bio.Emboss.Applications import WaterCommandline
>>> cline = WaterCommandline(gapopen=10, gapextend=0.5)
>>> cline.asequence = "asis:ACCCGGGCGCGGT"
>>> cline.bsequence = "asis:ACCCGAGCGCGGT"
>>> cline.outfile = "temp_water.txt"
>>> print(cline)
water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5
>>> cline
WaterCommandline(cmd='water', outfile='temp_water.txt', asequence='asis:ACCCGGGCGCGGT', bsequence='asis:ACCCGAGCGCGGT', gapopen=10, gapextend=0.5)
"""
answer = f"{self.__class__.__name__}(cmd={self.program_name!r}"
for parameter in self.parameters:
if parameter.is_set:
if isinstance(parameter, _Switch):
answer += f", {parameter.names[-1]}=True"
else:
answer += f", {parameter.names[-1]}={parameter.value!r}"
answer += ")"
return answer
def _get_parameter(self, name):
"""Get a commandline option value (PRIVATE)."""
for parameter in self.parameters:
if name in parameter.names:
if isinstance(parameter, _Switch):
return parameter.is_set
else:
return parameter.value
raise ValueError(f"Option name {name} was not found.")
def _clear_parameter(self, name):
"""Reset or clear a commandline option value (PRIVATE)."""
cleared_option = False
for parameter in self.parameters:
if name in parameter.names:
parameter.value = None
parameter.is_set = False
cleared_option = True
if not cleared_option:
raise ValueError(f"Option name {name} was not found.")
def set_parameter(self, name, value=None):
"""Set a commandline option for a program (OBSOLETE).
Every parameter is available via a property and as a named
keyword when creating the instance. Using either of these is
preferred to this legacy set_parameter method which is now
OBSOLETE, and likely to be DEPRECATED and later REMOVED in
future releases.
"""
set_option = False
for parameter in self.parameters:
if name in parameter.names:
if isinstance(parameter, _Switch):
if value is None:
import warnings
warnings.warn(
"For a switch type argument like %s, "
"we expect a boolean. None is treated "
"as FALSE!" % parameter.names[-1]
)
parameter.is_set = bool(value)
set_option = True
else:
if value is not None:
self._check_value(value, name, parameter.checker_function)
parameter.value = value
parameter.is_set = True
set_option = True
if not set_option:
raise ValueError(f"Option name {name} was not found.")
def _check_value(self, value, name, check_function):
"""Check whether the given value is valid (PRIVATE).
No return value - it either works or raises a ValueError.
This uses the passed function 'check_function', which can either
return a [0, 1] (bad, good) value or raise an error. Either way
this function will raise an error if the value is not valid, or
finish silently otherwise.
"""
if check_function is not None:
is_good = check_function(value) # May raise an exception
if is_good not in [0, 1, True, False]:
raise ValueError(
f"Result of check_function: {is_good!r} is of an unexpected value"
)
if not is_good:
raise ValueError(
f"Invalid parameter value {value!r} for parameter {name}"
)
def __setattr__(self, name, value):
"""Set attribute name to value (PRIVATE).
This code implements a workaround for a user interface issue.
Without this __setattr__ attribute-based assignment of parameters
will silently accept invalid parameters, leading to known instances
of the user assuming that parameters for the application are set,
when they are not.
>>> from Bio.Emboss.Applications import WaterCommandline
>>> cline = WaterCommandline(gapopen=10, gapextend=0.5, stdout=True)
>>> cline.asequence = "a.fasta"
>>> cline.bsequence = "b.fasta"
>>> cline.csequence = "c.fasta"
Traceback (most recent call last):
...
ValueError: Option name csequence was not found.
>>> print(cline)
water -stdout -asequence=a.fasta -bsequence=b.fasta -gapopen=10 -gapextend=0.5
This workaround uses a whitelist of object attributes, and sets the
object attribute list as normal, for these. Other attributes are
assumed to be parameters, and passed to the self.set_parameter method
for validation and assignment.
"""
if name in ["parameters", "program_name"]: # Allowed attributes
self.__dict__[name] = value
else:
self.set_parameter(name, value) # treat as a parameter
def __call__(self, stdin=None, stdout=True, stderr=True, cwd=None, env=None):
    """Execute command, wait for it to finish, return (stdout, stderr).

    Runs the command line tool and waits for it to finish. If it returns
    a non-zero error level, an exception is raised. Otherwise a tuple
    (stdout, stderr) is returned.

    The optional stdin argument should be a string of data which will be
    passed to the tool as standard input.

    The optional stdout and stderr argument may be filenames (string),
    but otherwise are treated as a booleans, and control if the output
    should be captured as strings (True, default), or ignored by sending
    it to /dev/null to avoid wasting memory (False). A stream which is
    sent to a file or ignored is not captured, and None is returned in
    its place (this is what subprocess.Popen.communicate gives for a
    stream which is not a pipe). If stdout and stderr are the same
    filename, both streams are written through one shared handle.

    The optional cwd argument is a string giving the working directory
    to run the command from. See Python's subprocess module documentation
    for more details.

    The optional env argument is a dictionary setting the environment
    variables to be used in the new process. By default the current
    process' environment variables are used. See Python's subprocess
    module documentation for more details.

    Default example usage::

        from Bio.Emboss.Applications import WaterCommandline
        water_cmd = WaterCommandline(gapopen=10, gapextend=0.5,
                                     stdout=True, auto=True,
                                     asequence="a.fasta", bsequence="b.fasta")
        print("About to run: %s" % water_cmd)
        std_output, err_output = water_cmd()

    This functionality is similar to subprocess.check_output(). In general
    if you require more control over running the command, use subprocess
    directly.

    When the program called returns a non-zero error level, a custom
    ApplicationError exception is raised. This includes any stdout and
    stderr strings captured as attributes of the exception object, since
    they may be useful for diagnosing what went wrong.

    Any file handle opened here for stdout or stderr is closed again
    before returning, including when starting or running the child
    process raises an exception.
    """
    # Handles opened by this call (as opposed to subprocess.PIPE). They
    # are closed in the finally clause so that nothing leaks if a later
    # open(), Popen() or communicate() raises. This is particularly
    # important on Jython and PyPy (where garbage collection is less
    # predictable) and on Windows (where files with an open handle
    # cannot be deleted).
    opened_handles = []
    try:
        if not stdout:
            stdout_arg = open(os.devnull, "w")
            opened_handles.append(stdout_arg)
        elif isinstance(stdout, str):
            stdout_arg = open(stdout, "w")
            opened_handles.append(stdout_arg)
        else:
            stdout_arg = subprocess.PIPE

        if not stderr:
            stderr_arg = open(os.devnull, "w")
            opened_handles.append(stderr_arg)
        elif isinstance(stderr, str):
            if stdout == stderr:
                # Write both to the same file; the handle is already
                # registered for closing via stdout.
                stderr_arg = stdout_arg
            else:
                stderr_arg = open(stderr, "w")
                opened_handles.append(stderr_arg)
        else:
            stderr_arg = subprocess.PIPE

        # We may not need to supply any piped input, but we setup the
        # standard input pipe anyway as a work around for a python
        # bug if this is called from a Windows GUI program. For
        # details, see http://bugs.python.org/issue1124861
        #
        # Using universal newlines is important on Python 3, this
        # gives unicode handles rather than bytes handles.

        # Windows 7, 8, 8.1 and 10 want shell = True
        if sys.platform != "win32":
            use_shell = True
        else:
            win_ver = platform.win32_ver()[0]
            use_shell = win_ver in ["7", "8", "post2012Server", "10"]

        # NOTE: the command is passed to the shell as a single string, so
        # it must not be assembled from untrusted input.
        child_process = subprocess.Popen(
            str(self),
            stdin=subprocess.PIPE,
            stdout=stdout_arg,
            stderr=stderr_arg,
            universal_newlines=True,
            cwd=cwd,
            env=env,
            shell=use_shell,
        )
        # Use .communicate as can get deadlocks with .wait(), see Bug 2804
        stdout_str, stderr_str = child_process.communicate(stdin)
        # Sanity check: nothing can have been captured for a stream which
        # was sent to the null device.
        if not stdout:
            assert not stdout_str, stdout_str
        if not stderr:
            assert not stderr_str, stderr_str
        return_code = child_process.returncode
    finally:
        for handle in opened_handles:
            handle.close()

    if return_code:
        raise ApplicationError(return_code, str(self), stdout_str, stderr_str)
    return stdout_str, stderr_str
class _AbstractParameter:
    """Common base for the objects describing one command line parameter.

    Do not use this directly, instead use one of the subclasses. Both
    methods here raise NotImplementedError until overridden.
    """

    def __init__(self):
        """Refuse direct instantiation."""
        raise NotImplementedError()

    def __str__(self):
        """Return the command line text of the parameter (not implemented here)."""
        raise NotImplementedError()
class _Option(_AbstractParameter):
    """Represent a command line option which takes a value.

    Use this for UNIXish options like --append=yes and -a yes, where a
    value (here "yes") is generally expected. UNIXish options which take
    no value, like -kimura in clustalw, should use the _Switch object
    instead.

    Attributes:
     - names -- a list of string names (typically two entries) by which
       the parameter can be set via the legacy set_parameter method
       (eg ["-a", "--append", "append"]). The first name is the one
       written to the command line. The last name is a one word "human
       readable" name; it must be a valid Python identifier because it
       is used as the property name and as a keyword argument, so it
       should follow PEP8 naming.
     - description -- text describing the option, used as the property
       docstring. Must be a string.
     - is_filename -- True if the value is a filename (or any other
       string) which should be quoted automatically if it contains
       spaces. Set from the filename argument.
     - checker_function -- a function deciding if a given value is valid
       for this parameter. It can either raise an error for a bad value,
       or return a [0, 1] decision on whether the value is correct.
     - equate -- should an equals sign be inserted between the name and
       the value (otherwise a space is used)?
     - is_required -- a flag to indicate if the parameter must be set
       for the program to be run.
     - is_set -- if the parameter has been set.
     - value -- the value of the parameter.

    """

    def __init__(
        self,
        names,
        description,
        filename=False,
        checker_function=None,
        is_required=False,
        equate=True,
    ):
        """Store the option's definition; it starts out unset with no value."""
        if not isinstance(description, str):
            raise TypeError(f"Should be a string: {description!r} for {names[-1]}")
        self.names = names
        self.description = description
        # Despite the argument name, 'filename' applies to any string
        # with spaces which needs quoting.
        self.is_filename = filename
        self.checker_function = checker_function
        self.is_required = is_required
        self.equate = equate
        self.value = None
        self.is_set = False

    def __str__(self):
        """Return the value of this option for the commandline.

        Includes a trailing space.
        """
        # Whether the name and value are joined with "=" or a space is
        # decided explicitly by the equate setting made when the option
        # was defined.
        flag = self.names[0]
        if self.value is None:
            return f"{flag} "
        text = _escape_filename(self.value) if self.is_filename else str(self.value)
        joiner = "=" if self.equate else " "
        return f"{flag}{joiner}{text} "
class _Switch(_AbstractParameter):
    """Represent an optional argument switch for a program.

    Use this for UNIXish options like -kimura in clustalw which take no
    value: they are either included in the command string or omitted.

    Attributes:
     - names -- a list of string names (typically two entries) by which
       the parameter can be set via the legacy set_parameter method
       (eg ["-a", "--append", "append"]). The first name is the one
       written to the command line. The last name is a one word "human
       readable" name; it must be a valid Python identifier because it
       is used as the property name and as a keyword argument, so it
       should follow PEP8 naming.
     - description -- text describing the option, used as the property
       docstring.
     - is_set -- if the parameter has been set.
     - is_required -- always False for a switch.

    NOTE - There is no value attribute, see is_set instead.

    """

    def __init__(self, names, description):
        """Store the switch's definition; it starts out unset."""
        self.names = names
        self.description = description
        self.is_required = False
        self.is_set = False

    def __str__(self):
        """Return the value of this option for the commandline.

        Includes a trailing space when the switch is set, otherwise the
        result is an empty string.
        """
        # A switch must never have been given a value attribute.
        assert not hasattr(self, "value")
        return f"{self.names[0]} " if self.is_set else ""
class _Argument(_AbstractParameter):
    """Represent an argument on a commandline.

    The names argument should be a list containing one string (this is
    not enforced). That string must be a valid Python identifier as it
    is used as the property name and as a keyword argument, and should
    therefore follow PEP8 naming.
    """

    def __init__(
        self,
        names,
        description,
        filename=False,
        checker_function=None,
        is_required=False,
    ):
        """Store the argument's definition; it starts out unset with no value."""
        if not isinstance(description, str):
            raise TypeError(f"Should be a string: {description!r} for {names[-1]}")
        self.names = names
        self.description = description
        # Despite the argument name, 'filename' applies to any string
        # with spaces which needs quoting.
        self.is_filename = filename
        self.checker_function = checker_function
        self.is_required = is_required
        self.value = None
        self.is_set = False

    def __str__(self):
        """Return the argument for the commandline, with a trailing space.

        An argument without a value is rendered as a single space.
        """
        shown = self.value
        if shown is None:
            return " "
        if self.is_filename:
            shown = _escape_filename(shown)
        return f"{shown} "
class _ArgumentList(_Argument):
    """Represent a variable list of arguments on a command line, e.g. multiple filenames."""

    # TODO - Option to require at least one value? e.g. min/max count?

    def __str__(self):
        """Return the space separated arguments, with a trailing space.

        Raises TypeError if the value is not a list, and ValueError if
        the list is empty.
        """
        entries = self.value
        if not isinstance(entries, list):
            raise TypeError("Arguments should be a list")
        if not entries:
            raise ValueError("Requires at least one filename")
        if self.is_filename:
            entries = (_escape_filename(entry) for entry in entries)
        # The trailing space keeps whatever parameter follows the last
        # entry from being merged into it, e.g.
        #   samtools cat in1.bam in2.bam-o out.sam   [without it, incorrect]
        #   samtools cat in1.bam in2.bam -o out.sam  [with it, correct]
        return " ".join(entries) + " "
class _StaticArgument(_AbstractParameter):
    """Represent a static (read only) argument on a commandline.

    It has no names, so it is not intended to be exposed as a named
    argument or property of a command line wrapper object.
    """

    def __init__(self, value):
        """Store the fixed value; the argument always counts as set."""
        self.value = value
        self.names = []
        self.is_set = True
        self.is_required = False

    def __str__(self):
        """Return the fixed value followed by a trailing space."""
        return "{} ".format(self.value)
def _escape_filename(filename):
    """Escape filenames with spaces by adding quotes (PRIVATE).

    Note this will not add quotes if they are already included:

    >>> print((_escape_filename('example with spaces')))
    "example with spaces"
    >>> print((_escape_filename('"example with spaces"')))
    "example with spaces"
    >>> print((_escape_filename(1)))
    1

    Note the function is more generic than the name suggests, since it
    is used to add quotes around any string arguments containing spaces.
    Anything which is not a string is returned unchanged.
    """
    # An alternative once considered for Windows was to substitute the DOS
    # style 8.3 short name of an existing file (win32api.GetShortPathName),
    # which has no spaces in it. That name is not portable between
    # machines, or even folders, so plain quoting is used instead.

    # Non-strings pass straight through, for example the NCBI BLAST+
    # -outfmt argument can be an integer. So do strings without spaces.
    if not isinstance(filename, str) or " " not in filename:
        return filename
    # We'll just quote it - works on Windows, Mac OS X etc
    already_quoted = filename.startswith('"') and filename.endswith('"')
    return filename if already_quoted else f'"{filename}"'
def _test():
    """Run the Bio.Application module's doctests (PRIVATE)."""
    from doctest import testmod

    testmod(verbose=1)


if __name__ == "__main__":
    # Executed directly as a script: run the doctests
    _test()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,600 +0,0 @@
# Copyright 2013 by Nate Sutton.
# Based on code in _Phyml.py by Eric Talevich.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command-line wrapper for tree inference program Fasttree."""
from Bio.Application import _Argument
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
def _is_int(x):
    """Test whether the argument can be serialized as an integer (PRIVATE).

    Returns True for int instances, and for any other value whose str()
    form is a non-empty run of the ASCII digits 0-9. Everything else
    gives False, including signed strings such as "-5" and floats.

    str.isdigit() on its own is not used because it also accepts
    non-ASCII digit characters (for example the superscript two, or
    digits from other scripts), which are not valid integer literals
    on a command line.
    """
    if isinstance(x, int):
        return True
    text = str(x)
    return text != "" and all(char in "0123456789" for char in text)
def _is_numeric(x):
    """Test whether the argument can be serialized as a number (PRIVATE).

    The value counts as numeric if float() accepts its str() form.
    """
    try:
        float(str(x))
    except ValueError:
        return False
    return True
class FastTreeCommandline(AbstractCommandline):
    r"""Command-line wrapper for FastTree.

    Only the ``input`` parameter is declared as required by this wrapper.
    The example below also passes ``out``, the path of the Newick tree
    output file.

    From the terminal command line use ``fasttree.exe -help`` or ``fasttree.exe -expert``
    for more explanation of usage options.

    Homepage: http://www.microbesonline.org/fasttree/

    References
    ----------
    Price, M.N., Dehal, P.S., and Arkin, A.P. (2010) FastTree 2 -- Approximately
    Maximum-Likelihood Trees for Large Alignments. PLoS ONE, 5(3):e9490.
    https://doi.org/10.1371/journal.pone.0009490.

    Examples
    --------
    This is an example on Windows::

        import _Fasttree
        fasttree_exe = r"C:\FasttreeWin32\fasttree.exe"
        cmd = _Fasttree.FastTreeCommandline(fasttree_exe,
        ...                                 input=r'C:\Input\ExampleAlignment.fsa',
        ...                                 out=r'C:\Output\ExampleTree.tree')
        print(cmd)
        out, err = cmd()
        print(out)
        print(err)

    """

    def __init__(self, cmd="fasttree", **kwargs):
        """Initialize the class.

        Declares the FastTree parameters, then passes cmd (the program
        name or path, by default "fasttree") and any keyword arguments on
        to AbstractCommandline.__init__.
        """
        # Every value-taking option below is declared with equate=False,
        # i.e. written as "-name value" rather than "-name=value".
        self.parameters = [
            # Sequence type, number of alignments and output name quoting.
            _Switch(
                ["-nt", "nt"],
                "By default FastTree expects protein alignments, use -nt for nucleotides",
            ),
            _Option(
                ["-n", "n"],
                """-n -- read N multiple alignments in.
This only works with phylip interleaved format. For example, you can
use it with the output from phylip's seqboot. If you use -n, FastTree
will write 1 tree per line to standard output.
""",
                checker_function=_is_int,
                equate=False,
            ),
            _Switch(
                ["-quote", "quote"],
                """-quote -- add quotes to sequence names in output.
Quote sequence names in the output and allow spaces, commas,
parentheses, and colons in them but not ' characters (fasta files only).
""",
            ),
            # Pseudocounts for distance estimation.
            _Option(
                ["-pseudo", "pseudo"],
                """-pseudo [weight] -- Pseudocounts are used with sequence distance estimation.
Use pseudocounts to estimate distances between sequences with little or no
overlap. (Off by default.) Recommended if analyzing the alignment has
sequences with little or no overlap.
If the weight is not specified, it is 1.0
""",
                checker_function=_is_numeric,
                equate=False,
            ),
            # Support values.
            _Option(
                ["-boot", "boot"],
                """Specify the number of resamples for support values.
Support value options:
By default, FastTree computes local support values by resampling the site
likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome,
it will compute minimum-evolution bootstrap supports instead
In either case, the support values are proportions ranging from 0 to 1
Use -nosupport to turn off support values or -boot 100 to use just 100 resamples.
""",
                checker_function=_is_int,
                equate=False,
            ),
            _Switch(
                ["-nosupport", "nosupport"],
                """Turn off support values.
Support value options:
By default, FastTree computes local support values by resampling the site
likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome,
it will compute minimum-evolution bootstrap supports instead
In either case, the support values are proportions ranging from 0 to 1
Use -nosupport to turn off support values or -boot 100 to use just 100 resamples.
""",
            ),
            # Starting trees (filenames, quoted if they contain spaces).
            _Option(
                ["-intree", "intree"],
                """-intree newickfile -- read the starting tree in from newickfile.
Any branch lengths in the starting trees are ignored.
-intree with -n will read a separate starting tree for each alignment.
""",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-intree1", "intree1"],
                "intree1 newickfile -- read the same starting tree for each alignment.",
                filename=True,
                equate=False,
            ),
            # Verbosity of the standard error output.
            _Switch(
                ["-quiet", "quiet"],
                """-quiet -- do not write to standard error during normal operation
(no progress indicator, no options summary, no likelihood values, etc.)
""",
            ),
            _Switch(
                ["-nopr", "nopr"],
                "-nopr -- do not write the progress indicator to stderr.",
            ),
            # Topology refinement.
            _Option(
                ["-nni", "nni"],
                """Set the rounds of minimum-evolution nearest-neighbor interchanges
Topology refinement:
By default, FastTree tries to improve the tree with up to 4*log2(N)
rounds of minimum-evolution nearest-neighbor interchanges (NNI),
where N is the number of unique sequences, 2 rounds of
subtree-prune-regraft (SPR) moves (also min. evo.), and
up to 2*log(N) rounds of maximum-likelihood NNIs.
Use -nni to set the number of rounds of min. evo. NNIs.
""",
                checker_function=_is_int,
                equate=False,
            ),
            _Option(
                ["-spr", "spr"],
                """Set the rounds of subtree-prune-regraft moves
Topology refinement:
By default, FastTree tries to improve the tree with up to 4*log2(N)
rounds of minimum-evolution nearest-neighbor interchanges (NNI),
where N is the number of unique sequences, 2 rounds of
subtree-prune-regraft (SPR) moves (also min. evo.), and
up to 2*log(N) rounds of maximum-likelihood NNIs.
Use -nni to set the number of rounds of min. evo. NNIs,
and -spr to set the rounds of SPRs.
""",
                checker_function=_is_int,
                equate=False,
            ),
            _Switch(
                ["-noml", "noml"],
                """Deactivate min-evo NNIs and SPRs.
Topology refinement:
By default, FastTree tries to improve the tree with up to 4*log2(N)
rounds of minimum-evolution nearest-neighbor interchanges (NNI),
where N is the number of unique sequences, 2 rounds of
subtree-prune-regraft (SPR) moves (also min. evo.), and
up to 2*log(N) rounds of maximum-likelihood NNIs.
Use -nni to set the number of rounds of min. evo. NNIs,
and -spr to set the rounds of SPRs.
Use -noml to turn off both min-evo NNIs and SPRs (useful if refining
an approximately maximum-likelihood tree with further NNIs).
""",
            ),
            _Switch(
                ["-mllen", "mllen"],
                """Optimize branch lengths on a fixed topology.
Topology refinement:
By default, FastTree tries to improve the tree with up to 4*log2(N)
rounds of minimum-evolution nearest-neighbor interchanges (NNI),
where N is the number of unique sequences, 2 rounds of
subtree-prune-regraft (SPR) moves (also min. evo.), and
up to 2*log(N) rounds of maximum-likelihood NNIs.
Use -nni to set the number of rounds of min. evo. NNIs,
and -spr to set the rounds of SPRs.
Use -mllen to optimize branch lengths without ML NNIs
Use -mllen -nome with -intree to optimize branch lengths on a fixed topology.
""",
            ),
            _Switch(
                ["-nome", "nome"],
                """Changes support values calculation to a minimum-evolution bootstrap method.
Topology refinement:
By default, FastTree tries to improve the tree with up to 4*log2(N)
rounds of minimum-evolution nearest-neighbor interchanges (NNI),
where N is the number of unique sequences, 2 rounds of
subtree-prune-regraft (SPR) moves (also min. evo.), and
up to 2*log(N) rounds of maximum-likelihood NNIs.
Use -nni to set the number of rounds of min. evo. NNIs,
and -spr to set the rounds of SPRs.
Use -mllen to optimize branch lengths without ML NNIs
Use -mllen -nome with -intree to optimize branch lengths on a fixed topology
Support value options:
By default, FastTree computes local support values by resampling the site
likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome,
it will compute minimum-evolution bootstrap supports instead
In either case, the support values are proportions ranging from 0 to 1.
""",
            ),
            _Option(
                ["-mlnni", "mlnni"],
                """Set the number of rounds of maximum-likelihood NNIs.
Topology refinement:
By default, FastTree tries to improve the tree with up to 4*log2(N)
rounds of minimum-evolution nearest-neighbor interchanges (NNI),
where N is the number of unique sequences, 2 rounds of
subtree-prune-regraft (SPR) moves (also min. evo.), and
up to 2*log(N) rounds of maximum-likelihood NNIs.
Use -nni to set the number of rounds of min. evo. NNIs,
and -spr to set the rounds of SPRs.
Use -mlnni to set the number of rounds of maximum-likelihood NNIs.
""",
                checker_function=_is_int,
                equate=False,
            ),
            _Option(
                ["-mlacc", "mlacc"],
                """Option for optimization of branches at each NNI.
Topology refinement:
By default, FastTree tries to improve the tree with up to 4*log2(N)
rounds of minimum-evolution nearest-neighbor interchanges (NNI),
where N is the number of unique sequences, 2 rounds of
subtree-prune-regraft (SPR) moves (also min. evo.), and
up to 2*log(N) rounds of maximum-likelihood NNIs.
Use -nni to set the number of rounds of min. evo. NNIs,
and -spr to set the rounds of SPRs.
Use -mlacc 2 or -mlacc 3 to always optimize all 5 branches at each NNI,
and to optimize all 5 branches in 2 or 3 rounds.
""",
                checker_function=_is_int,
                equate=False,
            ),
            _Switch(
                ["-slownni", "slownni"],
                """Turn off heuristics to avoid constant subtrees with NNIs.
Topology refinement:
By default, FastTree tries to improve the tree with up to 4*log2(N)
rounds of minimum-evolution nearest-neighbor interchanges (NNI),
where N is the number of unique sequences, 2 rounds of
subtree-prune-regraft (SPR) moves (also min. evo.), and
up to 2*log(N) rounds of maximum-likelihood NNIs.
Use -nni to set the number of rounds of min. evo. NNIs,
and -spr to set the rounds of SPRs.
Use -slownni to turn off heuristics to avoid constant subtrees
(affects both ML and ME NNIs).
""",
            ),
            # Maximum likelihood model options.
            _Switch(
                ["-wag", "wag"],
                """Maximum likelihood model options.
Whelan-And-Goldman 2001 model instead of (default)
Jones-Taylor-Thorton 1992 model (a.a. only)
""",
            ),
            _Switch(
                ["-gtr", "gtr"],
                """Maximum likelihood model options.
Use generalized time-reversible instead of (default)
Jukes-Cantor (nt only)
""",
            ),
            _Option(
                ["-cat", "cat"],
                """Maximum likelihood model options.
Specify the number of rate categories of sites (default 20).""",
                checker_function=_is_int,
                equate=False,
            ),
            _Switch(
                ["-nocat", "nocat"],
                "Maximum likelihood model options: No CAT model (just 1 category)",
            ),
            _Switch(
                ["-gamma", "gamma"],
                """Report the likelihood under the discrete gamma model.
Maximum likelihood model options:
-gamma -- after the final round of optimizing branch lengths with the CAT model,
report the likelihood under the discrete gamma model with the same
number of categories. FastTree uses the same branch lengths but
optimizes the gamma shape parameter and the scale of the lengths.
The final tree will have rescaled lengths. Used with -log, this
also generates per-site likelihoods for use with CONSEL, see
GammaLogToPaup.pl and documentation on the FastTree web site.
""",
            ),
            # Searching for the best join.
            _Switch(
                ["-slow", "slow"],
                """Use an exhaustive search.
Searching for the best join:
By default, FastTree combines the 'visible set' of fast neighbor-joining with
local hill-climbing as in relaxed neighbor-joining
-slow -- exhaustive search (like NJ or BIONJ, but different gap handling)
-slow takes half an hour instead of 8 seconds for 1,250 proteins
""",
            ),
            _Switch(
                ["-fastest", "fastest"],
                """Search the visible set (the top hit for each node) only.
Searching for the best join:
By default, FastTree combines the 'visible set' of fast neighbor-joining with
local hill-climbing as in relaxed neighbor-joining
-fastest -- search the visible set (the top hit for each node) only
Unlike the original fast neighbor-joining, -fastest updates visible(C)
after joining A and B if join(AB,C) is better than join(C,visible(C))
-fastest also updates out-distances in a very lazy way,
-fastest sets -2nd on as well, use -fastest -no2nd to avoid this
""",
            ),
            # Top-hit heuristics. "-2nd" is not a valid Python identifier,
            # so its property/keyword name is "second".
            _Switch(
                ["-2nd", "second"],
                """Turn 2nd-level top hits heuristic on.
Top-hit heuristics:
By default, FastTree uses a top-hit list to speed up search
Use -notop (or -slow) to turn this feature off
and compare all leaves to each other,
and all new joined nodes to each other
-2nd or -no2nd to turn 2nd-level top hits heuristic on or off
This reduces memory usage and running time but may lead to
marginal reductions in tree quality.
(By default, -fastest turns on -2nd.)
""",
            ),
            _Switch(
                ["-no2nd", "no2nd"],
                """Turn 2nd-level top hits heuristic off.
Top-hit heuristics:
By default, FastTree uses a top-hit list to speed up search
Use -notop (or -slow) to turn this feature off
and compare all leaves to each other,
and all new joined nodes to each other
-2nd or -no2nd to turn 2nd-level top hits heuristic on or off
This reduces memory usage and running time but may lead to
marginal reductions in tree quality.
(By default, -fastest turns on -2nd.)
""",
            ),
            # Random number generator seed.
            _Option(
                ["-seed", "seed"],
                """Use -seed to initialize the random number generator.
Support value options:
By default, FastTree computes local support values by resampling the site
likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome,
it will compute minimum-evolution bootstrap supports instead
In either case, the support values are proportions ranging from 0 to 1.
""",
                checker_function=_is_int,
                equate=False,
            ),
            # Top-hit heuristics, continued.
            _Switch(
                ["-top", "top"],
                """Top-hit list to speed up search
Top-hit heuristics:
By default, FastTree uses a top-hit list to speed up search
Use -notop (or -slow) to turn this feature off
and compare all leaves to each other,
and all new joined nodes to each other.
""",
            ),
            _Switch(
                ["-notop", "notop"],
                """Turn off top-hit list to speed up search
Top-hit heuristics:
By default, FastTree uses a top-hit list to speed up search
Use -notop (or -slow) to turn this feature off
and compare all leaves to each other,
and all new joined nodes to each other.
""",
            ),
            _Option(
                ["-topm", "topm"],
                """Change the top hits calculation method
Top-hit heuristics:
By default, FastTree uses a top-hit list to speed up search
-topm 1.0 -- set the top-hit list size to parameter*sqrt(N)
FastTree estimates the top m hits of a leaf from the
top 2*m hits of a 'close' neighbor, where close is
defined as d(seed,close) < 0.75 * d(seed, hit of rank 2*m),
and updates the top-hits as joins proceed.
""",
                checker_function=_is_numeric,
                equate=False,
            ),
            _Option(
                ["-close", "close"],
                """Modify the close heuristic for the top-hit list
Top-hit heuristics:
By default, FastTree uses a top-hit list to speed up search
-close 0.75 -- modify the close heuristic, lower is more conservative.
""",
                checker_function=_is_numeric,
                equate=False,
            ),
            _Option(
                ["-refresh", "refresh"],
                """Parameter for conditions that joined nodes are compared to other nodes
Top-hit heuristics:
By default, FastTree uses a top-hit list to speed up search
-refresh 0.8 -- compare a joined node to all other nodes if its
top-hit list is less than 80% of the desired length,
or if the age of the top-hit list is log2(m) or greater.
""",
                checker_function=_is_numeric,
                equate=False,
            ),
            # Distance matrices.
            _Option(
                ["-matrix", "matrix"],
                """Specify a matrix for nucleotide or amino acid distances
Distances:
Default: For protein sequences, log-corrected distances and an
amino acid dissimilarity matrix derived from BLOSUM45
or for nucleotide sequences, Jukes-Cantor distances
To specify a different matrix, use -matrix FilePrefix or -nomatrix
""",
                filename=True,
                equate=False,
            ),
            _Switch(
                ["-nomatrix", "nomatrix"],
                """Specify that no matrix should be used for nucleotide or amino acid distances
Distances:
Default: For protein sequences, log-corrected distances and an
amino acid dissimilarity matrix derived from BLOSUM45
or for nucleotide sequences, Jukes-Cantor distances
To specify a different matrix, use -matrix FilePrefix or -nomatrix
""",
            ),
            # Join options.
            _Switch(
                ["-nj", "nj"],
                "Join options: regular (unweighted) neighbor-joining (default)",
            ),
            _Switch(
                ["-bionj", "bionj"],
                """Join options: weighted joins as in BIONJ.
FastTree will also weight joins during NNIs.
""",
            ),
            # Neither -gtrrates nor -gtrfreq has a checker_function, so
            # their values are not validated by this wrapper.
            _Option(
                ["-gtrrates", "gtrrates"], "-gtrrates ac ag at cg ct gt", equate=False
            ),
            _Option(["-gtrfreq", "gtrfreq"], "-gtrfreq A C G T", equate=False),
            # Constrained topology search.
            _Option(
                ["-constraints", "constraints"],
                """Specifies an alignment file for use with constrained topology searching
Constrained topology search options:
-constraints alignmentfile -- an alignment with values of 0, 1, and -
Not all sequences need be present. A column of 0s and 1s defines a
constrained split. Some constraints may be violated
(see 'violating constraints:' in standard error).
""",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-constraintWeight", "constraintWeight"],
                """Weight strength of constraints in topology searching.
Constrained topology search options:
-constraintWeight -- how strongly to weight the constraints. A value of 1
means a penalty of 1 in tree length for violating a constraint
Default: 100.0
""",
                checker_function=_is_numeric,
                equate=False,
            ),
            # Log file and distance matrix output.
            _Option(
                ["-log", "log"],
                """Create log files of data such as intermediate trees and per-site rates
-log logfile -- save intermediate trees so you can extract
the trees and restart long-running jobs if they crash
-log also reports the per-site rates (1 means slowest category).
""",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-makematrix", "makematrix"],
                "-makematrix [alignment]",
                filename=True,
                equate=False,
            ),
            _Switch(
                ["-rawdist", "rawdist"],
                """Turn off or adjust log-correction in AA or NT distances.
Use -rawdist to turn the log-correction off or to use
%different instead of Jukes-Cantor in AA or NT distances
Distances:
Default: For protein sequences, log-corrected distances and an
amino acid dissimilarity matrix derived from BLOSUM45
or for nucleotide sequences, Jukes-Cantor distances
To specify a different matrix, use -matrix FilePrefix or -nomatrix
""",
            ),
            _Option(
                ["-sprlength", "sprlength"],
                """Set maximum SPR move length in topology refinement (default 10).
Topology refinement:
By default, FastTree tries to improve the tree with up to 4*log2(N)
rounds of minimum-evolution nearest-neighbor interchanges (NNI),
where N is the number of unique sequences, 2 rounds of
subtree-prune-regraft (SPR) moves (also min. evo.), and
up to 2*log(N) rounds of maximum-likelihood NNIs.
Use -nni to set the number of rounds of min. evo. NNIs,
and -spr to set the rounds of SPRs.
""",
                checker_function=_is_int,
                equate=False,
            ),
            # Built-in help.
            _Switch(["-help", "help"], "Show the help."),
            _Switch(["-expert", "expert"], "Show the expert level help."),
            # Output tree file. It is not declared with is_required=True.
            _Option(
                ["-out", "out"],
                """Enter <output file>
The path to a Newick Tree output file needs to be specified.
""",
                filename=True,
                equate=False,
            ),
            # Positional input alignment: the only parameter declared with
            # is_required=True, and the last entry in the list.
            _Argument(
                ["input"],
                """Enter <input file>
An input file of sequence alignments in fasta or phylip format
is needed. By default FastTree expects protein
alignments, use -nt for nucleotides.
""",
                filename=True,
                is_required=True,
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)

View File

@ -1,291 +0,0 @@
# Copyright 2011 by Eric Talevich. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command-line wrapper for the tree inference program PhyML."""
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
class PhymlCommandline(AbstractCommandline):
    """Command-line wrapper for the tree inference program PhyML.

    Homepage: http://www.atgc-montpellier.fr/phyml

    Each PhyML option below is registered under the last name in its name
    list (for example ``input``, ``datatype`` or ``model``) and can be given
    to the constructor as a keyword argument.

    References
    ----------
    Guindon S, Gascuel O.
    A simple, fast, and accurate algorithm to estimate large phylogenies by maximum
    likelihood.
    Systematic Biology, 2003 Oct;52(5):696-704.
    PubMed PMID: 14530136.

    Guindon S, Dufayard JF, Lefort V, Anisimova M, Hordijk W, Gascuel O.
    New Algorithms and Methods to Estimate Maximum-Likelihood Phylogenies: Assessing
    the Performance of PhyML 3.0.
    Systematic Biology, 2010 59(3):307-21.
    """

    def __init__(self, cmd="phyml", **kwargs):
        """Initialize the class.

        Arguments:
         - cmd - name (or path) of the PhyML executable.
         - kwargs - initial option values, keyed by option name.

        """
        named_models = (
            # Nucleotide models:
            "HKY85",
            "JC69",
            "K80",
            "F81",
            "F84",
            "TN93",
            "GTR",
            # Amino acid models:
            "LG",
            "WAG",
            "JTT",
            "MtREV",
            "Dayhoff",
            "DCMut",
            "RtREV",
            "CpREV",
            "VT",
            "Blosum62",
            "MtMam",
            "MtArt",
            "HIVw",
            "HIVb",
        )

        def is_dataset_count(value):
            """Accept an int, or a string consisting only of digits."""
            # Anything that is neither int nor str is rejected instead of
            # failing with AttributeError on a missing .isdigit().
            return isinstance(value, int) or (
                isinstance(value, str) and value.isdigit()
            )

        def is_model(value):
            """Accept a named model, a six-digit custom model, or an int."""
            if isinstance(value, int):
                # Kept for backward compatibility with the previous checker.
                return True
            if not isinstance(value, str):
                return False
            if value in named_models:
                return True
            # Custom models are written as six digits, e.g. "012345" for GTR.
            # They must be strings because leading zeros are significant.
            return len(value) == 6 and all(ch in "0123456789" for ch in value)

        self.parameters = [
            _Option(
                ["-i", "--input", "input"],
                "PHYLIP format input nucleotide or amino-acid sequence filename.",
                filename=True,
                is_required=True,
                equate=False,
            ),
            _Option(
                ["-d", "--datatype", "datatype"],
                "Datatype 'nt' for nucleotide (default) or 'aa' for amino-acids.",
                checker_function=lambda x: x in ("nt", "aa"),
                equate=False,
            ),
            _Switch(
                ["-q", "--sequential", "sequential"],
                "Changes interleaved format (default) to sequential format.",
            ),
            _Option(
                ["-n", "--multiple", "multiple"],
                "Number of data sets to analyse (integer).",
                checker_function=is_dataset_count,
                equate=False,
            ),
            _Switch(
                ["-p", "--pars", "pars"],
                """Use a minimum parsimony starting tree.
This option is taken into account when the '-u' option is absent
and when tree topology modifications are to be done.
""",
            ),
            _Option(
                ["-b", "--bootstrap", "bootstrap"],
                r"""Number of bootstrap replicates, if value is > 0.
Otherwise:
0: neither approximate likelihood ratio test nor bootstrap
values are computed.
-1: approximate likelihood ratio test returning aLRT statistics.
-2: approximate likelihood ratio test returning Chi2-based
parametric branch supports.
-4: SH-like branch supports alone.
""",
                equate=False,
            ),
            _Option(
                ["-m", "--model", "model"],
                """Substitution model name.
Nucleotide-based models:
HKY85 (default) | JC69 | K80 | F81 | F84 | TN93 | GTR | custom
For the custom option, a string of six digits identifies the
model. For instance, 000000 corresponds to F81 (or JC69,
provided the distribution of nucleotide frequencies is uniform).
012345 corresponds to GTR. This option can be used for encoding
any model that is a nested within GTR.
Amino-acid based models:
LG (default) | WAG | JTT | MtREV | Dayhoff | DCMut | RtREV |
CpREV | VT | Blosum62 | MtMam | MtArt | HIVw | HIVb | custom
""",
                checker_function=is_model,
                equate=False,
            ),
            _Option(
                ["-f", "frequencies"],
                """Character frequencies.
-f e, m, or "fA fC fG fT"
e : Empirical frequencies, determined as follows :
- Nucleotide sequences: (Empirical) the equilibrium base
frequencies are estimated by counting the occurrence
of the different bases in the alignment.
- Amino-acid sequences: (Empirical) the equilibrium
amino-acid frequencies are estimated by counting the
occurrence of the different amino-acids in the alignment.
m : ML/model-based frequencies, determined as follows :
- Nucleotide sequences: (ML) the equilibrium base
frequencies are estimated using maximum likelihood
- Amino-acid sequences: (Model) the equilibrium amino-acid
frequencies are estimated using the frequencies defined by
the substitution model.
"fA fC fG fT" : only valid for nucleotide-based models.
fA, fC, fG and fT are floating-point numbers that correspond
to the frequencies of A, C, G and T, respectively.
""",
                filename=True,  # ensure ".25 .25 .25 .25" stays quoted
                equate=False,
            ),
            _Option(
                ["-t", "--ts/tv", "ts_tv_ratio"],
                """Transition/transversion ratio. (DNA sequences only.)
Can be a fixed positive value (ex:4.0) or e to get the
maximum-likelihood estimate.
""",
                equate=False,
            ),
            _Option(
                ["-v", "--pinv", "prop_invar"],
                """Proportion of invariable sites.
Can be a fixed value in the range [0,1], or 'e' to get the
maximum-likelihood estimate.
""",
                equate=False,
            ),
            _Option(
                ["-c", "--nclasses", "nclasses"],
                """Number of relative substitution rate categories.
Default 1. Must be a positive integer.
""",
                equate=False,
            ),
            _Option(
                ["-a", "--alpha", "alpha"],
                """Distribution of the gamma distribution shape parameter.
Can be a fixed positive value, or 'e' to get the
maximum-likelihood estimate.
""",
                equate=False,
            ),
            _Option(
                ["-s", "--search", "search"],
                """Tree topology search operation option.
Can be one of:
NNI : default, fast
SPR : a bit slower than NNI
BEST : best of NNI and SPR search
""",
                checker_function=lambda x: x in ("NNI", "SPR", "BEST"),
                equate=False,
            ),
            # alt name: user_tree_file
            _Option(
                ["-u", "--inputtree", "input_tree"],
                "Starting tree filename. The tree must be in Newick format.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-o", "optimize"],
                r"""Specific parameter optimisation.
tlr : tree topology (t), branch length (l) and
rate parameters (r) are optimised.
tl : tree topology and branch length are optimised.
lr : branch length and rate parameters are optimised.
l : branch length are optimised.
r : rate parameters are optimised.
n : no parameter is optimised.
""",
                equate=False,
            ),
            _Switch(
                ["--rand_start", "rand_start"],
                """Sets the initial tree to random.
Only valid if SPR searches are to be performed.
""",
            ),
            _Option(
                ["--n_rand_starts", "n_rand_starts"],
                """Number of initial random trees to be used.
Only valid if SPR searches are to be performed.
""",
                equate=False,
            ),
            _Option(
                ["--r_seed", "r_seed"],
                """Seed used to initiate the random number generator.
Must be an integer.
""",
                equate=False,
            ),
            _Switch(
                ["--print_site_lnl", "print_site_lnl"],
                r"Print the likelihood for each site in file \*_phyml_lk.txt.",
            ),
            _Switch(
                ["--print_trace", "print_trace"],
                r"""
Print each phylogeny explored during the tree search process
in file \*_phyml_trace.txt.""",
            ),
            _Option(
                ["--run_id", "run_id"],
                """Append the given string at the end of each PhyML output file.
This option may be useful when running simulations involving
PhyML.
""",
                checker_function=lambda x: isinstance(x, str),
                equate=False,
            ),
            # XXX should this always be set to True?
            _Switch(
                ["--quiet", "quiet"],
                "No interactive questions (for running in batch mode).",
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)

View File

@ -1,406 +0,0 @@
# Copyright 2012 by Eric Talevich. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command-line wrapper for the tree inference program RAxML.
Derived from the help page for RAxML version 7.3 by Alexandros Stamatakis, but
should work for any version 7.X (and probably earlier for most options).
"""
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
class RaxmlCommandline(AbstractCommandline):
    """Command-line wrapper for the tree inference program RAxML.

    The required parameters are 'sequences' (-s), 'model' (-m) and 'name' (-n).
    The parameter 'parsimony_seed' (-p) must also be set for RAxML, but if you
    do not specify it, this wrapper will set the seed to 10000 for you.

    References
    ----------
    Stamatakis A.
    RAxML-VI-HPC: Maximum Likelihood-based Phylogenetic Analyses with
    Thousands of Taxa and Mixed Models.
    Bioinformatics 2006, 22(21):2688-2690.

    Homepage: http://sco.h-its.org/exelixis/software.html

    Examples
    --------
    >>> from Bio.Phylo.Applications import RaxmlCommandline
    >>> raxml_cline = RaxmlCommandline(sequences="Tests/Phylip/interlaced2.phy",
    ...                                model="PROTCATWAG", name="interlaced2")
    >>> print(raxml_cline)
    raxmlHPC -m PROTCATWAG -n interlaced2 -p 10000 -s Tests/Phylip/interlaced2.phy

    You would typically run the command line with raxml_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="raxmlHPC", **kwargs):
        """Initialize the class.

        Arguments:
         - cmd - name (or path) of the RAxML executable.
         - kwargs - initial option values, keyed by option name.

        If no (truthy) 'parsimony_seed' is supplied it is set to 10000.
        """
        self.parameters = [
            _Option(
                ["-a", "weight_filename"],
                "Name of a column weight file to assign individual weights "
                "to each column of the alignment. Those weights must be "
                "integers separated by any type and number of whitespaces "
                "within a separate file.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-b", "bootstrap_seed"], "Random seed for bootstrapping.", equate=False
            ),
            _Option(
                ["-c", "num_categories"],
                "Number of distinct rate categories for RAxML when "
                "evolution model is set to GTRCAT or GTRMIX. "
                "Individual per-site rates are categorized into this "
                "many rate categories to accelerate computations. "
                "Default: 25.",
                equate=False,
            ),
            _Switch(
                ["-d", "random_starting_tree"],
                "Start ML optimization from random starting tree.",
            ),
            _Option(
                ["-e", "epsilon"],
                "Set model optimization precision in log likelihood units "
                "for final optimization of tree topology under MIX/MIXI "
                "or GAMMA/GAMMAI. "
                "Default: 0.1 for models not using proportion of "
                "invariant sites estimate; 0.001 for models using "
                "proportion of invariant sites estimate.",
                equate=False,
            ),
            _Option(
                ["-E", "exclude_filename"],
                "An exclude file name, containing a specification of "
                "alignment positions you wish to exclude. Format is "
                "similar to Nexus, the file shall contain entries like "
                "'100-200 300-400'; to exclude a single column write, "
                "e.g., '100-100'. If you use a mixed model, an "
                "appropriately adapted model file will be written.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-f", "algorithm"],
                r"""
Select algorithm:
a: Rapid Bootstrap analysis and search for best-scoring ML
tree in one program run.
b: Draw bipartition information on a tree provided with '-t'
based on multiple trees (e.g. form a bootstrap) in a file
specified by '-z'.
c: Check if the alignment can be properly read by RAxML.
d: New rapid hill-climbing (DEFAULT).
e: Optimize model+branch lengths for given input tree under
GAMMA/GAMMAI only.
g: Compute per site log Likelihoods for one or more trees
passed via '-z' and write them to a file that can be read
by CONSEL.
h: Compute log likelihood test (SH-test) between best tree
passed via '-t' and a bunch of other trees passed via '-z'.
i: Perform a really thorough bootstrap, refinement of final
bootstrap tree under GAMMA and a more exhaustive algorithm.
j: Generate a bunch of bootstrapped alignment files from an
original alignment file.
m: Compare bipartitions between two bunches of trees passed
via '-t' and '-z' respectively. This will return the
Pearson correlation between all bipartitions found in the
two tree files. A file called
RAxML_bipartitionFrequencies.outputFileName will be
printed that contains the pair-wise bipartition
frequencies of the two sets.
n: Compute the log likelihood score of all trees contained
in a tree file provided by '-z' under GAMMA or
GAMMA+P-Invar.
o: Old and slower rapid hill-climbing.
p: Perform pure stepwise MP addition of new sequences to an
incomplete starting tree.
s: Split up a multi-gene partitioned alignment into the
respective subalignments.
t: Do randomized tree searches on one fixed starting tree.
w: Compute ELW test on a bunch of trees passed via '-z'.
x: Compute pair-wise ML distances, ML model parameters will
be estimated on an MP starting tree or a user-defined
tree passed via '-t', only allowed for GAMMA-based models
of rate heterogeneity.
""",
                checker_function=(lambda x: isinstance(x, str) and len(x) == 1),
                equate=False,
            ),
            _Option(
                ["-g", "grouping_constraint"],
                "File name of a multifurcating constraint tree. "
                "this tree does not need to be comprehensive, i.e. "
                "contain all taxa.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-i", "rearrangements"],
                "Initial rearrangement setting for the subsequent "
                "application of topological changes phase.",
                equate=False,
            ),
            _Switch(
                ["-j", "checkpoints"],
                "Write checkpoints (intermediate tree topologies).",
            ),
            _Switch(
                ["-k", "bootstrap_branch_lengths"],
                "Print bootstrapped trees with branch lengths. "
                "The bootstraps will run a bit longer, because model "
                "parameters will be optimized at the end of each run. "
                "Use with CATMIX/PROTMIX or GAMMA/GAMMAI.",
            ),
            _Option(
                ["-l", "cluster_threshold"],
                "Threshold for sequence similarity clustering. "
                "RAxML will then print out an alignment to a file "
                "called sequenceFileName.reducedBy.threshold that "
                "only contains sequences <= the specified threshold "
                "that must be between 0.0 and 1.0. RAxML uses the "
                "QT-clustering algorithm to perform this task. "
                "In addition, a file called "
                "RAxML_reducedList.outputFileName will be written "
                "that contains clustering information.",
                equate=False,
            ),
            _Option(
                ["-L", "cluster_threshold_fast"],
                "Same functionality as '-l', but uses a less "
                "exhaustive and thus faster clustering algorithm. "
                "This is intended for very large datasets with more "
                "than 20,000-30,000 sequences.",
                equate=False,
            ),
            _Option(
                ["-m", "model"],
                r"""Model of Nucleotide or Amino Acid Substitution:
NUCLEOTIDES:
GTRCAT : GTR + Optimization of substitution rates + Optimization of site-specific
evolutionary rates which are categorized into numberOfCategories distinct
rate categories for greater computational efficiency
if you do a multiple analysis with '-#' or '-N' but without bootstrapping the program
will use GTRMIX instead
GTRGAMMA : GTR + Optimization of substitution rates + GAMMA model of rate
heterogeneity (alpha parameter will be estimated)
GTRMIX : Inference of the tree under GTRCAT
and thereafter evaluation of the final tree topology under GTRGAMMA
GTRCAT_GAMMA : Inference of the tree with site-specific evolutionary rates.
However, here rates are categorized using the 4 discrete GAMMA rates.
Evaluation of the final tree topology under GTRGAMMA
GTRGAMMAI : Same as GTRGAMMA, but with estimate of proportion of invariable sites
GTRMIXI : Same as GTRMIX, but with estimate of proportion of invariable sites
GTRCAT_GAMMAI : Same as GTRCAT_GAMMA, but with estimate of proportion of invariable sites
AMINO ACIDS:
PROTCATmatrixName[F] : specified AA matrix + Optimization of substitution rates + Optimization of site-specific
evolutionary rates which are categorized into numberOfCategories distinct
rate categories for greater computational efficiency
if you do a multiple analysis with '-#' or '-N' but without bootstrapping the program
will use PROTMIX... instead
PROTGAMMAmatrixName[F] : specified AA matrix + Optimization of substitution rates + GAMMA model of rate
heterogeneity (alpha parameter will be estimated)
PROTMIXmatrixName[F] : Inference of the tree under specified AA matrix + CAT
and thereafter evaluation of the final tree topology under specified AA matrix + GAMMA
PROTCAT_GAMMAmatrixName[F] : Inference of the tree under specified AA matrix and site-specific evolutionary rates.
However, here rates are categorized using the 4 discrete GAMMA rates.
Evaluation of the final tree topology under specified AA matrix + GAMMA
PROTGAMMAImatrixName[F] : Same as PROTGAMMAmatrixName[F], but with estimate of proportion of invariable sites
PROTMIXImatrixName[F] : Same as PROTMIXmatrixName[F], but with estimate of proportion of invariable sites
PROTCAT_GAMMAImatrixName[F] : Same as PROTCAT_GAMMAmatrixName[F], but with estimate of proportion of invariable sites
Available AA substitution models: DAYHOFF, DCMUT, JTT, MTREV, WAG, RTREV, CPREV, VT, BLOSUM62, MTMAM, GTR
With the optional 'F' appendix you can specify if you want to use empirical base frequencies
Please note that for mixed models you can in addition specify the per-gene AA model in
the mixed model file (see manual for details)
""",
                equate=False,
            ),
            _Switch(
                ["-M", "partition_branch_lengths"],
                "Switch on estimation of individual per-partition "
                "branch lengths. Only has effect when used in "
                "combination with 'partition_filename' ('-q'). "
                "Branch lengths for individual partitions will be "
                "printed to separate files. A weighted average of the "
                "branch lengths is computed by using the respective "
                "partition lengths. ",
            ),
            _Option(
                ["-n", "name"],
                "Name used in the output files.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-o", "outgroup"],
                "Name of a single outgroup or a comma-separated list "
                "of outgroups, eg '-o Rat' or '-o Rat,Mouse'. In case "
                "that multiple outgroups are not monophyletic the "
                "first name in the list will be selected as outgroup. "
                "Don't leave spaces between taxon names!",
                checker_function=lambda x: len(x.split()) == 1,
                equate=False,
            ),
            _Option(
                ["-q", "partition_filename"],
                "File name containing the assignment of models to "
                "alignment partitions for multiple models of "
                "substitution. For the syntax of this file please "
                "consult the RAxML manual.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-p", "parsimony_seed"],
                "Random number seed for the parsimony inferences. "
                "This allows you to reproduce your results and will "
                "help developers debug the program. This option HAS "
                "NO EFFECT in the parallel MPI version.",
                equate=False,
            ),
            _Option(
                ["-P", "protein_model"],
                "File name of a user-defined AA (Protein) substitution "
                "model. This file must contain 420 entries, the first "
                "400 being the AA substitution rates (this must be a "
                "symmetric matrix) and the last 20 are the empirical "
                "base frequencies.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-r", "binary_constraint"],
                "File name of a binary constraint tree. "
                "This tree does not need to be comprehensive, i.e. "
                "contain all taxa.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-s", "sequences"],
                "Name of the alignment data file, in PHYLIP format.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-t", "starting_tree"],
                "File name of a user starting tree, in Newick format.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-T", "threads"],
                "Number of threads to run. "
                "PTHREADS VERSION ONLY! "
                "Make sure to set this at most the number of CPUs "
                "you have on your machine, otherwise, there will be "
                "a huge performance decrease!",
                equate=False,
            ),
            _Option(
                ["-u", "num_bootstrap_searches"],
                "Number of multiple bootstrap searches per replicate. "
                "Use this to obtain better ML trees for each "
                "replicate. Default: 1 ML search per bootstrap "
                "replicate.",
                equate=False,
            ),
            _Switch(["-v", "version"], "Display version information."),
            _Option(
                ["-w", "working_dir"],
                "Name of the working directory where RAxML will "
                "write its output files. Default: current directory.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-x", "rapid_bootstrap_seed"],
                "Random seed for rapid bootstrapping.",
                equate=False,
            ),
            _Switch(
                ["-y", "parsimony"],
                "Only compute a parsimony starting tree, then exit.",
            ),
            _Option(
                ["-z", "bipartition_filename"],
                "Name of a file containing multiple trees, e.g. from "
                "a bootstrap run, that shall be used to draw "
                "bipartition values onto a tree provided with '-t'. "
                "It can also be used to compute per-site log "
                "likelihoods in combination with '-f g', and to read "
                "a bunch of trees for a couple of other options "
                "('-f h', '-f m', '-f n').",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-N", "-#", "num_replicates"],
                "Number of alternative runs on distinct starting trees. "
                "In combination with the '-b' option, this will invoke a "
                "multiple bootstrap analysis. "
                "DEFAULT: 1 single analysis. "
                "Note that '-N' has been added as an alternative since "
                "'-#' sometimes caused problems with certain MPI job "
                "submission systems, since '-#' is often used to start "
                "comments. ",
                equate=False,
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
        # ENH: enforce -s, -n and -m
        # Supply the default seed promised in the class docstring. Note that
        # any falsy value (including 0) is replaced as well.
        if not self.parsimony_seed:
            self.parsimony_seed = 10000
# Run this module's doctests when it is executed as a script.
if __name__ == "__main__":
    from Bio import _utils

    _utils.run_doctest()

View File

@ -1,19 +0,0 @@
# Copyright 2011 by Eric Talevich. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Phylogenetics command line tool wrappers (OBSOLETE).
We have decided to remove this module in future, and instead recommend
building your command and invoking it via the subprocess module directly.
"""
from ._Fasttree import FastTreeCommandline
from ._Phyml import PhymlCommandline
from ._Raxml import RaxmlCommandline
# Declare the public API explicitly so that the three wrapper classes
# imported above show up in the API docs.
__all__ = ("PhymlCommandline", "RaxmlCommandline", "FastTreeCommandline")

View File

@ -1,948 +0,0 @@
# Copyright 2009 by Tiago Antao <tiagoantao@gmail.com>. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Module to control GenePop."""
import os
import re
import shutil
import tempfile
from Bio.Application import _Argument
from Bio.Application import AbstractCommandline
def _gp_float(tok):
"""Get a float from a token, if it fails, returns the string (PRIVATE)."""
try:
return float(tok)
except ValueError:
return str(tok)
def _gp_int(tok):
"""Get a int from a token, if it fails, returns the string (PRIVATE)."""
try:
return int(tok)
except ValueError:
return str(tok)
def _read_allele_freq_table(f):
line = f.readline()
while " --" not in line:
if line == "":
raise StopIteration
if "No data" in line:
return None, None
line = f.readline()
alleles = [x for x in f.readline().rstrip().split(" ") if x != ""]
alleles = [_gp_int(x) for x in alleles]
line = f.readline().rstrip()
table = []
while line != "":
parts = [x for x in line.split(" ") if x != ""]
try:
table.append(
(parts[0], [_gp_float(x) for x in parts[1:-1]], _gp_int(parts[-1]))
)
except ValueError:
table.append((parts[0], [None] * len(alleles), 0))
line = f.readline().rstrip()
return alleles, table
def _read_table(f, funs):
table = []
line = f.readline().rstrip()
while "---" not in line:
line = f.readline().rstrip()
line = f.readline().rstrip()
while "===" not in line and "---" not in line and line != "":
toks = [x for x in line.split(" ") if x != ""]
parts = []
for i, tok in enumerate(toks):
try:
parts.append(funs[i](tok))
except ValueError:
parts.append(tok) # Could not cast
table.append(tuple(parts))
line = f.readline().rstrip()
return table
def _read_triangle_matrix(f):
matrix = []
line = f.readline().rstrip()
while line != "":
matrix.append([_gp_float(x) for x in [y for y in line.split(" ") if y != ""]])
line = f.readline().rstrip()
return matrix
def _read_headed_triangle_matrix(f):
matrix = {}
header = f.readline().rstrip()
if "---" in header or "===" in header:
header = f.readline().rstrip()
nlines = len([x for x in header.split(" ") if x != ""]) - 1
for line_pop in range(nlines):
line = f.readline().rstrip()
vals = [x for x in line.split(" ")[1:] if x != ""]
clean_vals = []
for val in vals:
try:
clean_vals.append(_gp_float(val))
except ValueError:
clean_vals.append(None)
for col_pop, clean_val in enumerate(clean_vals):
matrix[(line_pop + 1, col_pop)] = clean_val
return matrix
def _hw_func(stream, is_locus, has_fisher=False):
line = stream.readline()
if is_locus:
hook = "Locus "
else:
hook = "Pop : "
while line != "":
if line.lstrip().startswith(hook):
stream.readline()
stream.readline()
stream.readline()
table = _read_table(
stream, [str, _gp_float, _gp_float, _gp_float, _gp_float, _gp_int, str]
)
# loci might mean pop if hook="Locus "
loci = {}
for entry in table:
if len(entry) < 4:
loci[entry[0]] = None
else:
locus, p, se, fis_wc, fis_rh, steps = entry[:-1]
if se == "-":
se = None
loci[locus] = p, se, fis_wc, fis_rh, steps
return loci
line = stream.readline()
# self.done = True
raise StopIteration
class _FileIterator:
"""Return an iterator which crawls over a stream of lines with a function (PRIVATE).
The generator function is expected to yield a tuple, while
consuming input
"""
def __init__(self, func, fname, handle=None):
self.func = func
if handle is None:
self.stream = open(fname)
else:
# For special cases where calling code wants to
# seek into the file before starting:
self.stream = handle
self.fname = fname
self.done = False
def __iter__(self):
if self.done:
self.done = True
raise StopIteration
return self
def __next__(self):
return self.func(self)
def __del__(self):
self.stream.close()
os.remove(self.fname)
class _GenePopCommandline(AbstractCommandline):
    """Command line wrapper for the GenePop program (PRIVATE).

    Every setting is registered as a positional ``_Argument`` whose value is
    a ``Key=Value`` string (see ``set_menu`` and ``set_input``).
    """

    def __init__(self, genepop_dir=None, cmd="Genepop", **kwargs):
        """Register the GenePop arguments and select batch mode.

        ``genepop_dir`` is accepted but not used by this constructor.
        """
        required = (
            ("command", "GenePop option to be called"),
            ("mode", "Should always be batch"),
            ("input", "Input file"),
        )
        optional = (
            ("Dememorization", "Dememorization step"),
            ("BatchNumber", "Number of MCMC batches"),
            ("BatchLength", "Length of MCMC chains"),
            ("HWtests", "Enumeration or MCMC"),
            ("IsolBDstatistic", "IBD statistic (a or e)"),
            ("MinimalDistance", "Minimal IBD distance"),
            ("GeographicScale", "Log or Linear"),
        )
        arguments = [
            _Argument([name], description, is_required=True)
            for name, description in required
        ]
        arguments += [_Argument([name], description) for name, description in optional]
        self.parameters = arguments
        super().__init__(cmd, **kwargs)
        self.set_parameter("mode", "Mode=Batch")

    def set_menu(self, option_list):
        """Set the menu option.

        The entries are joined with dots, so set_menu([6,1]) selects
        menu 6.1 (get all F statistics).
        """
        menu = ".".join(map(str, option_list))
        self.set_parameter("command", "MenuOptions=" + menu)

    def set_input(self, fname):
        """Set the input file name."""
        self.set_parameter("input", "InputFile=" + fname)
class GenePopController:
"""Define a class to interface with the GenePop program."""
def __init__(self, genepop_dir=None):
    """Create the controller and its command line wrapper.

    Arguments:
     - genepop_dir - directory where GenePop is; it is handed on to the
       internal command line wrapper. The binary should be called
       Genepop (capital G).

    """
    self.controller = _GenePopCommandline(genepop_dir=genepop_dir)
def _get_opts(self, dememorization, batches, iterations, enum_test=None):
opts = {}
opts["Dememorization"] = dememorization
opts["BatchNumber"] = batches
opts["BatchLength"] = iterations
if enum_test is not None:
if enum_test is True:
opts["HWtests"] = "Enumeration"
else:
opts["HWtests"] = "MCMC"
return opts
def _run_genepop(self, extensions, option, fname, opts=None):
if opts is None:
opts = {}
cwd = os.getcwd()
temp_dir = tempfile.mkdtemp()
os.chdir(temp_dir)
self.controller.set_menu(option)
if os.path.isabs(fname):
self.controller.set_input(fname)
else:
self.controller.set_input(cwd + os.sep + fname)
for opt in opts:
self.controller.set_parameter(opt, opt + "=" + str(opts[opt]))
self.controller() # checks error level is zero
os.chdir(cwd)
shutil.rmtree(temp_dir)
def _test_pop_hz_both(
    self,
    fname,
    type,
    ext,
    enum_test=True,
    dememorization=10000,
    batches=20,
    iterations=5000,
):
    """Run a per-population Hardy-Weinberg heterozygote test (PRIVATE).

    ``type`` is the sub-option of GenePop menu 1 to run and ``ext`` is the
    extension of the result file that is read back.

    Returns a population iterator containing a dictionary where
    dictionary[locus]=(P-val, SE, Fis-WC, Fis-RH, steps).

    Some loci have a None if the info is not available.
    SE might be none (for enumerations).
    """
    self._run_genepop(
        [ext],
        [1, type],
        fname,
        self._get_opts(dememorization, batches, iterations, enum_test),
    )
    # Each step of the iterator parses the next "Pop : " table.
    return _FileIterator(lambda it: _hw_func(it.stream, False), fname + ext)
def _test_global_hz_both(
    self,
    fname,
    type,
    ext,
    enum_test=True,
    dememorization=10000,
    batches=20,
    iterations=5000,
):
    """Use Global Hardy-Weinberg test for heterozygote deficiency/excess (PRIVATE).

    ``type`` is the sub-option of GenePop menu 1 to run and ``ext`` is the
    extension of the result file that is read back.

    Returns a triple with:
     - A list per population containing (pop_name, P-val, SE, switches).
       Some pops have a None if the info is not available.
       SE might be none (for enumerations).
     - A list per loci containing (locus_name, P-val, SE, switches).
       Some loci have a None if the info is not available.
       SE might be none (for enumerations).
     - Overall results (P-val, SE, switches).

    Raises ValueError if an expected section is missing from the result
    file (previously this looped forever).
    """
    opts = self._get_opts(dememorization, batches, iterations, enum_test)
    self._run_genepop([ext], [1, type], fname, opts)
    result_file = fname + ext
    columns = [str, _gp_float, _gp_float, _gp_float]

    def skip_to(handle, marker):
        """Advance handle just past the first line containing marker."""
        for line in iter(handle.readline, ""):
            if marker in line:
                return
        raise ValueError("%r section not found in %s" % (marker, result_file))

    # Each section is searched from the top of the file, so the result does
    # not depend on the order in which GenePop writes the sections.
    with open(result_file) as handle:
        skip_to(handle, "by population")
        pop_p = _read_table(handle, columns)
    with open(result_file) as handle:
        skip_to(handle, "by locus")
        loc_p = _read_table(handle, columns)
    with open(result_file) as handle:
        skip_to(handle, "all locus")
        # The overall figures sit on the fifth line after the marker.
        for _ in range(4):
            handle.readline()
        line = handle.readline().rstrip()
    p, se, switches = tuple(_gp_float(x) for x in line.split(" ") if x != "")
    return pop_p, loc_p, (p, se, switches)
# 1.1
def test_pop_hz_deficiency(
    self, fname, enum_test=True, dememorization=10000, batches=20, iterations=5000
):
    """Use Hardy-Weinberg test for heterozygote deficiency.

    Runs GenePop menu option 1.1 and reads the ".D" result file.

    Returns a population iterator containing a dictionary where
    dictionary[locus]=(P-val, SE, Fis-WC, Fis-RH, steps).

    Some loci have a None if the info is not available.
    SE might be none (for enumerations).
    """
    suboption, result_ext = 1, ".D"
    return self._test_pop_hz_both(
        fname, suboption, result_ext, enum_test, dememorization, batches, iterations
    )
# 1.2
def test_pop_hz_excess(
    self, fname, enum_test=True, dememorization=10000, batches=20, iterations=5000
):
    """Use Hardy-Weinberg test for heterozygote excess.

    Runs GenePop menu option 1.2 and reads the ".E" result file.

    Returns a population iterator containing a dictionary where
    dictionary[locus]=(P-val, SE, Fis-WC, Fis-RH, steps).

    Some loci have a None if the info is not available.
    SE might be none (for enumerations).
    """
    suboption, result_ext = 2, ".E"
    return self._test_pop_hz_both(
        fname, suboption, result_ext, enum_test, dememorization, batches, iterations
    )
# 1.3 P file
def test_pop_hz_prob(
    self,
    fname,
    ext,
    enum_test=False,
    dememorization=10000,
    batches=20,
    iterations=5000,
):
    """Use Hardy-Weinberg test based on probability.

    Runs GenePop menu option 1.3. The ".P" result file is copied to ".P2"
    so that it can be walked twice independently. ``ext`` is only handed on
    to the GenePop runner; the files read back are always ".P" and ".P2".

    Returns a pair of iterators:
     1. A loci iterator, each item being a
        dictionary[pop_pos]=(P-val, SE, Fis-WC, Fis-RH, steps).
     2. A population iterator, each item being a
        dictionary[locus]=(P-val, SE, Fis-WC, Fis-RH, steps).

    Some entries are None if the info is not available.
    SE might be none (for enumerations).
    """
    opts = self._get_opts(dememorization, batches, iterations, enum_test)
    self._run_genepop([ext], [1, 3], fname, opts)

    loci_path = fname + ".P"
    pop_path = fname + ".P2"
    shutil.copyfile(loci_path, pop_path)

    loci_iter = _FileIterator(lambda it: _hw_func(it.stream, True, True), loci_path)
    pop_iter = _FileIterator(lambda it: _hw_func(it.stream, False, True), pop_path)
    return loci_iter, pop_iter
# 1.4
def test_global_hz_deficiency(
    self, fname, enum_test=True, dememorization=10000, batches=20, iterations=5000
):
    """Use Global Hardy-Weinberg test for heterozygote deficiency.

    Runs GenePop menu option 1.4 and reads the ".DG" result file.

    Returns a triple with:
     - A list per population containing (pop_name, P-val, SE, switches).
       Some pops have a None if the info is not available.
       SE might be none (for enumerations).
     - A list per loci containing (locus_name, P-val, SE, switches).
       Some loci have a None if the info is not available.
       SE might be none (for enumerations).
     - Overall results (P-val, SE, switches).

    """
    suboption, result_ext = 4, ".DG"
    return self._test_global_hz_both(
        fname, suboption, result_ext, enum_test, dememorization, batches, iterations
    )
# 1.5
def test_global_hz_excess(
    self, fname, enum_test=True, dememorization=10000, batches=20, iterations=5000
):
    """Use Global Hardy-Weinberg test for heterozygote excess.

    Runs GenePop menu option 1.5 and reads the ".EG" result file.

    Returns a triple with:
     - A list per population containing (pop_name, P-val, SE, switches).
       Some pops have a None if the info is not available.
       SE might be none (for enumerations).
     - A list per loci containing (locus_name, P-val, SE, switches).
       Some loci have a None if the info is not available.
       SE might be none (for enumerations).
     - Overall results (P-val, SE, switches).

    """
    suboption, result_ext = 5, ".EG"
    return self._test_global_hz_both(
        fname, suboption, result_ext, enum_test, dememorization, batches, iterations
    )
# 2.1
def test_ld(self, fname, dememorization=10000, batches=20, iterations=5000):
    """Test for linkage disequilibrium on each pair of loci in each population.

    Runs GenePop menu option 2.1. The ".DIS" result file is copied to ".DI2"
    so that the two result tables can be walked independently.

    Returns a pair of iterators:
     - per population and locus pair:
       (pop_index, pop_name, (locus1, locus2), (P-val, SE, switches)),
       the last element being None when the row reads "No ...".
       pop_index starts at 0 and increases each time the first locus pair
       of the table shows up again.
     - per locus pair: ((locus1, locus2), (Chi2, df, P-val)).

    Raises ValueError if an expected table is missing from the result file
    (previously this looped forever).
    """
    opts = self._get_opts(dememorization, batches, iterations)
    self._run_genepop([".DIS"], [2, 1], fname, opts)

    def ld_pop_func(self):
        # ``self`` is the _FileIterator driving this function.
        line = self.stream.readline().rstrip()
        if line == "":
            self.done = True
            raise StopIteration
        toks = [x for x in line.split(" ") if x != ""]
        pop, locus1, locus2 = toks[0], toks[1], toks[2]
        # The population counter must survive between calls, so it is kept
        # on the iterator. As plain locals it was reset on every row and
        # the reported index was always 0.
        if not hasattr(self, "start_locus1"):
            self.start_locus1, self.start_locus2 = locus1, locus2
            self.current_pop = -1
        if locus1 == self.start_locus1 and locus2 == self.start_locus2:
            self.current_pop += 1
        if toks[3] == "No":
            return self.current_pop, pop, (locus1, locus2), None
        p, se, switches = _gp_float(toks[3]), _gp_float(toks[4]), _gp_int(toks[5])
        return self.current_pop, pop, (locus1, locus2), (p, se, switches)

    def ld_func(self):
        line = self.stream.readline().rstrip()
        if line == "":
            self.done = True
            raise StopIteration
        toks = [x for x in line.split(" ") if x != ""]
        # The two locus names are taken from columns 0 and 2.
        locus1, locus2 = toks[0], toks[2]
        try:
            chi2, df, p = _gp_float(toks[3]), _gp_int(toks[4]), _gp_float(toks[5])
        except ValueError:
            return (locus1, locus2), None
        return (locus1, locus2), (chi2, df, p)

    dis_path = fname + ".DIS"
    di2_path = fname + ".DI2"

    def read_until(handle, marker, line=None):
        """Return the first line containing marker, starting with ``line``."""
        if line is None:
            line = handle.readline()
        while marker not in line:
            if line == "":
                raise ValueError("%r not found in %s" % (marker, dis_path))
            line = handle.readline()
        return line

    f1 = open(dis_path)
    f2 = None
    try:
        read_until(f1, "----")
        shutil.copyfile(dis_path, di2_path)
        f2 = open(di2_path)
        line = read_until(f2, "Locus pair")
        read_until(f2, "----", line)
    except BaseException:
        # The iterators never take ownership, so close the handles here.
        f1.close()
        if f2 is not None:
            f2.close()
        raise
    return (
        _FileIterator(ld_pop_func, dis_path, f1),
        _FileIterator(ld_func, di2_path, f2),
    )
# 2.2
def create_contingency_tables(self, fname):
    """Placeholder for creating genotypic contingency tables.

    Not implemented: always raises NotImplementedError; ``fname`` is unused.
    """
    raise NotImplementedError()
# 3.1 PR/GE files
def test_genic_diff_all(
self, fname, dememorization=10000, batches=20, iterations=5000
):
"""Provision for Genic differentiation for all populations."""
raise NotImplementedError
# 3.2 PR2/GE2 files
def test_genic_diff_pair(
self, fname, dememorization=10000, batches=20, iterations=5000
):
"""Provision for Genic differentiation for all population pairs."""
raise NotImplementedError
# 3.3 G files
def test_genotypic_diff_all(
self, fname, dememorization=10000, batches=20, iterations=5000
):
"""Provision for Genotypic differentiation for all populations."""
raise NotImplementedError
# 3.4 2G2 files
def test_genotypic_diff_pair(
self, fname, dememorization=10000, batches=20, iterations=5000
):
"""Provision for Genotypic differentiation for all population pairs."""
raise NotImplementedError
# 4
def estimate_nm(self, fname):
    """Estimate the Number of Migrants.

    Runs GenePop through ``_run_genepop`` (option ``[4]``), reads the
    resulting ``fname.PRI`` file and deletes it again.

    Parameters:
     - fname - file name

    Returns a 6-tuple:
     - Mean sample size
     - Mean frequency of private alleles
     - Number of migrants for Ne=10
     - Number of migrants for Ne=25
     - Number of migrants for Ne=50
     - Number of migrants after correcting for expected size

    A value whose line is not present in the GenePop output is returned as
    None.  If a value is reported more than once, the last one wins.
    """
    self._run_genepop(["PRI"], [4], fname)
    with open(fname + ".PRI") as f:
        lines = f.readlines()  # Small file, it is ok
    # Remove the output right away so it does not linger if parsing fails.
    os.remove(fname + ".PRI")

    # One pattern per returned value, in return order.
    patterns = (
        "Mean sample size: ([.0-9]+)",
        r"Mean frequency of private alleles p\(1\)= ([.0-9]+)",
        "N=10: ([.0-9]+)",
        "N=25: ([.0-9]+)",
        "N=50: ([.0-9]+)",
        "for size= ([.0-9]+)",
    )
    # Defaults of None avoid an UnboundLocalError on incomplete output.
    values = [None] * len(patterns)
    for line in lines:
        for index, pattern in enumerate(patterns):
            m = re.search(pattern, line)
            if m is not None:
                values[index] = _gp_float(m.group(1))
    return tuple(values)
# 5.1
def calc_allele_genotype_freqs(self, fname):
"""Calculate allele and genotype frequencies per locus and per sample.
Runs GenePop through _run_genepop (options [5, 1]) and parses the
resulting fname.INF output lazily through two iterators.
Parameters:
- fname - file name
Returns tuple with 2 elements:
- Population iterator with
- population name
- Locus dictionary with key = locus name and content tuple as
Genotype List with
(Allele1, Allele2, observed, expected)
(expected homozygotes, observed hm,
expected heterozygotes, observed ht)
Allele frequency/Fis dictionary with allele as key and
(count, frequency, Fis Weir & Cockerham)
- Totals as a triple (None if no "Tot" row was found)
- count
- Fis Weir & Cockerham,
- Fis Robertson & Hill
- Locus iterator with
- Locus name
- allele list
- Population list with a triple
- population name
- list of allele frequencies in the same order as allele list above
- number of genes
Will create a file called fname.INF and a copy of it called fname.IN2
(one file per iterator).
"""
self._run_genepop(["INF"], [5, 1], fname)
# NOTE: the commented-out block below is a disabled first pass over the
# file; it is not executed and is kept only for reference.
# First pass, general information
# num_loci = None
# num_pops = None
# with open(fname + ".INF") as f:
# line = f.readline()
# while (num_loci is None or num_pops is None) and line != '':
# m = re.search("Number of populations detected : ([0-9+])", l)
# if m is not None:
# num_pops = _gp_int(m.group(1))
# m = re.search("Number of loci detected : ([0-9+])", l)
# if m is not None:
# num_loci = _gp_int(m.group(1))
# line = f.readline()
# Parser returning one (population name, locus dictionary) pair per call.
# State that has to survive between calls (old_line, first_locus,
# curr_pop) is stored as attributes on the iterator object ("self").
def pop_parser(self):
# Resume from the header line pushed back by the previous call, if any.
if hasattr(self, "old_line"):
line = self.old_line
del self.old_line
else:
line = self.stream.readline()
loci_content = {}
while line != "":
line = line.rstrip()
# This heading ends the per-population part of the file: return what
# was collected for the current population.
if "Tables of allelic frequencies for each locus" in line:
return self.curr_pop, loci_content
match = re.match(".*Pop: (.+) Locus: (.+)", line)
if match is not None:
pop = match.group(1).rstrip()
locus = match.group(2)
# Remember the first locus seen for the current population.
if not hasattr(self, "first_locus"):
self.first_locus = locus
if hasattr(self, "curr_pop"):
# Seeing the first locus again means the next population has started:
# push this header line back for the next call, reset the per-population
# state and return the population that was just completed.
if self.first_locus == locus:
old_pop = self.curr_pop
# self.curr_pop = pop
self.old_line = line
del self.first_locus
del self.curr_pop
return old_pop, loci_content
self.curr_pop = pop
else:
# Not a "Pop: ... Locus: ..." header: keep scanning.
line = self.stream.readline()
continue
geno_list = []
line = self.stream.readline()
# No genotype table for this locus: continue the outer loop with the line
# that was just read.
if "No data" in line:
continue
while "Genotypes Obs." not in line:
line = self.stream.readline()
# Genotype rows up to the first empty line; rows that do not match the
# expected layout are skipped.
while line != "\n":
m2 = re.match(" +([0-9]+) , ([0-9]+) *([0-9]+) *(.+)", line)
if m2 is not None:
geno_list.append(
(
_gp_int(m2.group(1)),
_gp_int(m2.group(2)),
_gp_int(m2.group(3)),
_gp_float(m2.group(4)),
)
)
else:
line = self.stream.readline()
continue
line = self.stream.readline()
while "Expected number of ho" not in line:
line = self.stream.readline()
# Four consecutive lines carry the homozygote/heterozygote numbers; each
# value is taken from character offset 38 to the end of the line.
expHo = _gp_float(line[38:])
line = self.stream.readline()
obsHo = _gp_int(line[38:])
line = self.stream.readline()
expHe = _gp_float(line[38:])
line = self.stream.readline()
obsHe = _gp_int(line[38:])
line = self.stream.readline()
while "Sample count" not in line:
line = self.stream.readline()
line = self.stream.readline()
freq_fis = {}
overall_fis = None
# Rows up to the dashed separator: one row per allele, plus a "Tot" row
# holding the overall values.
while "----" not in line:
vals = [x for x in line.rstrip().split(" ") if x != ""]
if vals[0] == "Tot":
overall_fis = (
_gp_int(vals[1]),
_gp_float(vals[2]),
_gp_float(vals[3]),
)
else:
freq_fis[_gp_int(vals[0])] = (
_gp_int(vals[1]),
_gp_float(vals[2]),
_gp_float(vals[3]),
)
line = self.stream.readline()
loci_content[locus] = (
geno_list,
(expHo, obsHo, expHe, obsHe),
freq_fis,
overall_fis,
)
# End of file: mark the iterator as finished.
self.done = True
raise StopIteration
# Parser returning one (locus name, allele list, table) triple per call;
# the table itself is read by _read_allele_freq_table.
def locus_parser(self):
line = self.stream.readline()
while line != "":
line = line.rstrip()
match = re.match(" Locus: (.+)", line)
if match is not None:
locus = match.group(1)
alleles, table = _read_allele_freq_table(self.stream)
return locus, alleles, table
line = self.stream.readline()
self.done = True
raise StopIteration
# Each iterator gets its own file so the two can be consumed independently.
shutil.copyfile(fname + ".INF", fname + ".IN2")
pop_iter = _FileIterator(pop_parser, fname + ".INF")
locus_iter = _FileIterator(locus_parser, fname + ".IN2")
return (pop_iter, locus_iter)
def _calc_diversities_fis(self, fname, ext):
    """Run GenePop option 5.2 and parse gene diversities and Fis (PRIVATE).

    The output file ``fname + ext`` is scanned once for the per-sample
    summary tables; the per-locus results are then served lazily.

    Returns a triple:
     - iterator yielding per locus either (locus, None) when the locus is
       reported as "No complete", or
       (locus, fis_table, avg_qinter, avg_fis)
     - per-sample table read with columns (str, float, float, float)
     - per-sample table read with columns (str, float)
    """
    self._run_genepop([ext], [5, 2], fname)
    summary_header = (
        "Statistics per sample over all loci with at least two individuals typed"
    )
    with open(fname + ext) as handle:
        # readline-based iteration: _read_table consumes lines from the
        # same handle inside the loop.
        for raw in iter(handle.readline, ""):
            if raw.rstrip().startswith(summary_header):
                avg_fis = _read_table(handle, [str, _gp_float, _gp_float, _gp_float])
                avg_Qintra = _read_table(handle, [str, _gp_float])

    def fis_func(self):
        stream = self.stream
        for raw in iter(stream.readline, ""):
            found = re.search("Locus: (.+)", raw.rstrip())
            if found is None:
                continue
            locus = found.group(1)
            stream.readline()
            if "No complete" in stream.readline():
                return locus, None
            stream.readline()
            fis_table = _read_table(
                stream, [str, _gp_float, _gp_float, _gp_float]
            )
            stream.readline()
            fields = [tok for tok in stream.readline().split(" ") if tok != ""]
            # Exactly two numbers are expected on this line.
            avg_qinter, locus_avg_fis = tuple(_gp_float(tok) for tok in fields)
            return locus, fis_table, avg_qinter, locus_avg_fis
        self.done = True
        raise StopIteration

    return _FileIterator(fis_func, fname + ext), avg_fis, avg_Qintra
# 5.2
def calc_diversities_fis_with_identity(self, fname):
    """Compute identity-based gene diversities and Fis.

    Delegates to ``_calc_diversities_fis`` with the ``".DIV"`` extension
    and returns its result unchanged.
    """
    identity_ext = ".DIV"
    return self._calc_diversities_fis(fname, identity_ext)
# 5.3
def calc_diversities_fis_with_size(self, fname):
    """Placeholder for computing allele size-based gene diversities and Fis.

    Not implemented: always raises NotImplementedError; ``fname`` is unused.
    """
    raise NotImplementedError()
# 6.1 Less genotype frequencies
def calc_fst_all(self, fname):
    """Execute GenePop and get Fst/Fis/Fit (all populations).

    Parameters:
     - fname - file name

    Returns:
     - (multiLocusFis, multiLocusFst, multiLocus Fit); an entry is None
       when its column cannot be converted to a number
     - Iterator of tuples
       (Locus name, Fis, Fst, Fit, Qintra, Qinter)

    Will create a file called ``fname.FST``.

    This does not return the genotype frequencies.
    """
    self._run_genepop([".FST"], [6, 1], fname)

    def float_or_none(text):
        try:
            return _gp_float(text)
        except ValueError:
            return None

    with open(fname + ".FST") as handle:
        for raw in iter(handle.readline, ""):
            if raw.startswith(" All:"):
                fields = [tok for tok in raw.rstrip().split(" ") if tok != ""]
                allFis = float_or_none(fields[1])
                allFst = float_or_none(fields[2])
                allFit = float_or_none(fields[3])

    def proc(self):
        # Start from the locus header pushed back by the previous call, if any.
        if hasattr(self, "last_line"):
            line = self.last_line
            del self.last_line
        else:
            line = self.stream.readline()
        locus = None
        # Line prefix -> parsed value, kept in the order of the result tuple.
        stats = {
            "Fis^=": None,
            "Fst^=": None,
            "Fit^=": None,
            "1-Qintra^=": None,
            "1-Qinter^=": None,
        }
        while line != "":
            line = line.rstrip()
            if line.startswith(" Locus:"):
                if locus is not None:
                    # Header of the next locus: keep it for the next call.
                    self.last_line = line
                    return (locus, *stats.values())
                locus = line.split(":")[1].lstrip()
            else:
                prefix = next((p for p in stats if line.startswith(p)), None)
                if prefix is not None:
                    stats[prefix] = _gp_float(line.split(" ")[1])
                    if prefix == "1-Qinter^=":
                        # Last statistic of a locus: the record is complete.
                        return (locus, *stats.values())
            line = self.stream.readline()
        if locus is not None:
            return (locus, *stats.values())
        self.stream.close()
        self.done = True
        raise StopIteration

    return (allFis, allFst, allFit), _FileIterator(proc, fname + ".FST")
# 6.2
def calc_fst_pair(self, fname):
    """Estimate spatial structure from Allele identity for all population pairs.

    Runs GenePop (options ``[6, 2]``), reads the all-loci matrix from
    ``fname.ST2`` and removes the ``fname.MIG`` file.

    Returns a pair:
     - iterator of (locus name, matrix) read from ``fname.ST2``
     - the matrix following the "Estimates for all loci" heading
    """
    self._run_genepop([".ST2", ".MIG"], [6, 2], fname)
    with open(fname + ".ST2") as handle:
        # readline-based iteration: the matrix reader consumes lines from
        # the same handle inside the loop.
        for raw in iter(handle.readline, ""):
            if raw.rstrip().startswith("Estimates for all loci"):
                avg_fst = _read_headed_triangle_matrix(handle)

    def loci_func(self):
        for raw in iter(self.stream.readline, ""):
            hit = re.search(" Locus: (.+)", raw.rstrip())
            if hit is not None:
                locus = hit.group(1)
                matrix = _read_headed_triangle_matrix(self.stream)
                return locus, matrix
        self.done = True
        raise StopIteration

    os.remove(fname + ".MIG")
    return _FileIterator(loci_func, fname + ".ST2"), avg_fst
# 6.3
def calc_rho_all(self, fname):
    """Placeholder for estimating spatial structure from allele size (all populations).

    Not implemented: always raises NotImplementedError; ``fname`` is unused.
    """
    raise NotImplementedError()
# 6.4
def calc_rho_pair(self, fname):
    """Placeholder for estimating spatial structure from allele size (all population pairs).

    Not implemented: always raises NotImplementedError; ``fname`` is unused.
    """
    raise NotImplementedError()
def _calc_ibd(self, fname, sub, stat="a", scale="Log", min_dist=0.00001):
    """Calculate isolation by distance statistics (PRIVATE).

    Parameters:
     - fname - file name
     - sub - second element of the GenePop option pair ``[6, sub]``
     - stat - passed to GenePop as ``IsolBDstatistic``
     - scale - passed to GenePop as ``GeographicScale``
     - min_dist - passed to GenePop as ``MinimalDistance``

    Returns (estimate matrix, distance matrix, (a, b), (bb, bblow, bbhigh)),
    all read from ``fname.ISO``.  The ``.MIG``, ``.GRA`` and ``.ISO`` files
    are removed afterwards.
    """
    genepop_opts = {
        "MinimalDistance": min_dist,
        "GeographicScale": scale,
        "IsolBDstatistic": stat,
    }
    self._run_genepop(
        [".GRA", ".MIG", ".ISO"], [6, sub], fname, opts=genepop_opts
    )

    def skip_lines(handle, count):
        for _ in range(count):
            handle.readline()

    # The .ISO file is read strictly by position: fixed numbers of lines
    # are skipped between the parts that are parsed.
    with open(fname + ".ISO") as iso:
        skip_lines(iso, 4)
        estimate = _read_triangle_matrix(iso)
        skip_lines(iso, 2)
        distance = _read_triangle_matrix(iso)
        skip_lines(iso, 1)
        regression = re.match("a = (.+), b = (.+)", iso.readline().rstrip())
        a = _gp_float(regression.group(1))
        b = _gp_float(regression.group(2))
        skip_lines(iso, 2)
        slope = re.match(" b=(.+)", iso.readline().rstrip())
        bb = _gp_float(slope.group(1))
        interval = re.match(r".*\[(.+) ; (.+)\]", iso.readline().rstrip())
        bblow = _gp_float(interval.group(1))
        bbhigh = _gp_float(interval.group(2))
    for leftover_ext in (".MIG", ".GRA", ".ISO"):
        os.remove(fname + leftover_ext)
    return estimate, distance, (a, b), (bb, bblow, bbhigh)
# 6.5
def calc_ibd_diplo(self, fname, stat="a", scale="Log", min_dist=0.00001):
    """Calculate isolation by distance statistics for diploid data.

    Delegates to ``_calc_ibd`` with sub-option 5; ``stat``, ``scale`` and
    ``min_dist`` are forwarded unchanged.

    Note that each pop can only have a single individual and
    the individual name has to be the sample coordinates.
    """
    diploid_sub_option = 5
    return self._calc_ibd(fname, diploid_sub_option, stat, scale, min_dist)
# 6.6
def calc_ibd_haplo(self, fname, stat="a", scale="Log", min_dist=0.00001):
    """Calculate isolation by distance statistics for haploid data.

    Delegates to ``_calc_ibd`` with sub-option 6; ``stat``, ``scale`` and
    ``min_dist`` are forwarded unchanged.

    Note that each pop can only have a single individual and
    the individual name has to be the sample coordinates.
    """
    haploid_sub_option = 6
    return self._calc_ibd(fname, haploid_sub_option, stat, scale, min_dist)

View File

@ -1,199 +0,0 @@
# Copyright 2009 by Tiago Antao <tiagoantao@gmail.com>. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Control GenePop through an easier interface.
This interface is less efficient than the standard GenePopController.
"""
from Bio.PopGen import GenePop
from .Controller import GenePopController
class EasyController:
    """Define a class for an easier interface with the GenePop program.

    Every method runs GenePop on the file given at construction time through
    a ``GenePopController`` and returns plain Python values instead of
    iterators.  Only the per-locus pairwise Fst matrices and the allele
    frequencies are cached; all other calls re-run GenePop.
    """

    def __init__(self, fname, genepop_dir=None):
        """Initialize the controller.

        genepop_dir is the directory where GenePop is.

        The binary should be called Genepop (capital G)
        """
        self._fname = fname
        self._controller = GenePopController(genepop_dir)
        self.__fst_pair_locus = {}  # More caches like this needed!
        self.__allele_frequency = {}  # More caches like this needed!

    def _pop_locus_data(self, pop_pos, locus_name):
        """Return the parsed data tuple of one locus in one population (PRIVATE).

        The tuple is (genotype list, heterozygosity info, allele/Fis
        dictionary, totals), as produced by the controller's
        ``calc_allele_genotype_freqs`` population iterator.
        """
        pop_iter, _loc_iter = self._controller.calc_allele_genotype_freqs(
            self._fname
        )
        return list(pop_iter)[pop_pos][1][locus_name]

    def get_basic_info(self):
        """Obtain the population list and loci list from the file."""
        with open(self._fname) as f:
            rec = GenePop.read(f)
        return rec.pop_list, rec.loci_list

    # 1.3
    def test_hw_pop(self, pop_pos, test_type="probability"):
        """Perform Hardy-Weinberg test on the given position.

        test_type is "deficiency", "excess" or anything else for the
        probability test.  Returns one element of the controller's
        per-population result iterator.
        """
        if test_type == "deficiency":
            hw_res = self._controller.test_pop_hz_deficiency(self._fname)
        elif test_type == "excess":
            hw_res = self._controller.test_pop_hz_excess(self._fname)
        else:
            _loci_res, hw_res, _fisher_full = self._controller.test_pop_hz_prob(
                self._fname, ".P"
            )
        # NOTE(review): this skips pop_pos - 1 results, so pop_pos 0 and 1
        # both return the first one, unlike the 0-based indexing used by the
        # other methods.  Left unchanged; confirm the intended convention.
        for _ in range(pop_pos - 1):
            next(hw_res)
        return next(hw_res)

    # 1.4
    def test_hw_global(
        self,
        test_type="deficiency",
        enum_test=True,
        dememorization=10000,
        batches=20,
        iterations=5000,
    ):
        """Perform Hardy-Weinberg global Heterozygote test.

        test_type "deficiency" selects the deficiency test, any other value
        the excess test.  Returns (population results as a list, locus
        results as a list, overall result).
        """
        if test_type == "deficiency":
            run_test = self._controller.test_global_hz_deficiency
        else:
            run_test = self._controller.test_global_hz_excess
        pop_res, loc_res, overall = run_test(
            self._fname, enum_test, dememorization, batches, iterations
        )
        return list(pop_res), list(loc_res), overall

    # 2.1
    def test_ld_all_pair(
        self, locus1, locus2, dememorization=10000, batches=20, iterations=5000
    ):
        """Test for linkage disequilibrium for each pair of loci in each population.

        Returns the result for the given pair of loci (in either order), or
        None if the pair is not found.
        """
        all_ld = self._controller.test_ld(
            self._fname, dememorization, batches, iterations
        )[1]
        wanted = ((locus1, locus2), (locus2, locus1))
        for (l1, l2), result in all_ld:
            if (l1, l2) in wanted:
                return result
        return None

    def estimate_nm(self):
        """Estimate Nm. Just a simple bridge."""
        return self._controller.estimate_nm(self._fname)

    def get_heterozygosity_info(self, pop_pos, locus_name):
        """Return the heterozygosity info for a certain locus on a population.

        Returns (Expected homozygotes, observed homozygotes,
        Expected heterozygotes, observed heterozygotes)
        """
        return self._pop_locus_data(pop_pos, locus_name)[1]

    def get_genotype_count(self, pop_pos, locus_name):
        """Return the genotype counts for a certain population and locus."""
        return self._pop_locus_data(pop_pos, locus_name)[0]

    def get_fis(self, pop_pos, locus_name):
        """Return the Fis for a certain population and locus.

        Below CW means Cockerham and Weir and RH means Robertson and Hill.

        Returns a pair:

        - dictionary [allele] = (repetition count, frequency, Fis CW )
          with information for each allele
        - a triple with total number of alleles, Fis CW, Fis RH

        """
        return self._pop_locus_data(pop_pos, locus_name)[2:]

    def get_alleles(self, pop_pos, locus_name):
        """Return the alleles for a certain population and locus."""
        return list(self._pop_locus_data(pop_pos, locus_name)[2])

    def get_alleles_all_pops(self, locus_name):
        """Return the alleles of a certain locus over all populations.

        Returns None if the locus is not found.
        """
        _pop_iter, loc_iter = self._controller.calc_allele_genotype_freqs(
            self._fname
        )
        for locus_info in loc_iter:
            if locus_info[0] == locus_name:
                return locus_info[1]
        return None

    def get_allele_frequency(self, pop_pos, locus_name):
        """Calculate the allele frequency for a certain locus on a population.

        Returns (number of genes, dictionary [allele] = frequency).  The
        parsed frequencies of all loci are cached on first use.
        """
        if not self.__allele_frequency:
            _pop_iter, loc_iter = self._controller.calc_allele_genotype_freqs(
                self._fname
            )
            for locus_info in loc_iter:
                if locus_info[0] is None:
                    self.__allele_frequency[locus_info[0]] = None, None
                else:
                    self.__allele_frequency[locus_info[0]] = locus_info[1:]
        alleles, pop_table = self.__allele_frequency[locus_name]
        _pop_name, freqs, total = pop_table[pop_pos]
        allele_freq = {allele: freqs[i] for i, allele in enumerate(alleles)}
        return total, allele_freq

    def get_multilocus_f_stats(self):
        """Return the multilocus F stats.

        Explain averaging.
        Returns Fis(CW), Fst, Fit
        """
        return self._controller.calc_fst_all(self._fname)[0]

    def get_f_stats(self, locus_name):
        """Return F stats for a locus.

        Returns Fis(CW), Fst, Fit, Qintra, Qinter, or None if the locus is
        not found.
        """
        loci_iter = self._controller.calc_fst_all(self._fname)[1]
        for name, fis, fst, fit, qintra, qinter in loci_iter:
            if name == locus_name:
                return fis, fst, fit, qintra, qinter
        return None

    def get_avg_fis(self):
        """Calculate identity-base average Fis."""
        return self._controller.calc_diversities_fis_with_identity(self._fname)[1]

    def get_avg_fst_pair(self):
        """Return the all-loci pairwise estimate for all population pairs.

        This is the second element returned by the controller's
        ``calc_fst_pair``.
        """
        return self._controller.calc_fst_pair(self._fname)[1]

    def get_avg_fst_pair_locus(self, locus):
        """Return the pairwise estimate for all population pairs of the given locus.

        The per-locus matrices of all loci are cached on first use.
        """
        if not self.__fst_pair_locus:
            loci_iter = self._controller.calc_fst_pair(self._fname)[0]
            for locus_info in loci_iter:
                self.__fst_pair_locus[locus_info[0]] = locus_info[1]
        return self.__fst_pair_locus[locus]

    def calc_ibd(self, is_diplo=True, stat="a", scale="Log", min_dist=0.00001):
        """Calculate isolation by distance statistics for Diploid or Haploid."""
        if is_diplo:
            return self._controller.calc_ibd_diplo(self._fname, stat, scale, min_dist)
        else:
            return self._controller.calc_ibd_haplo(self._fname, stat, scale, min_dist)

View File

@ -1,217 +0,0 @@
# Copyright 2009 by Osvaldo Zagordi. All rights reserved.
# Revisions copyright 2010 by Peter Cock.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the short read aligner Novoalign by Novocraft."""
from Bio.Application import _Option
from Bio.Application import AbstractCommandline
class NovoalignCommandline(AbstractCommandline):
    """Command line wrapper for novoalign by Novocraft.

    See www.novocraft.com - novoalign is a short read alignment program.

    Examples
    --------
    >>> from Bio.Sequencing.Applications import NovoalignCommandline
    >>> novoalign_cline = NovoalignCommandline(database='some_db',
    ...                                        readfile='some_seq.txt')
    >>> print(novoalign_cline)
    novoalign -d some_db -f some_seq.txt

    As with all the Biopython application wrappers, you can also add or
    change options after creating the object:

    >>> novoalign_cline.format = 'PRBnSEQ'
    >>> novoalign_cline.r_method='0.99' # limited valid values
    >>> novoalign_cline.fragment = '250 20' # must be given as a string
    >>> novoalign_cline.miRNA = 100
    >>> print(novoalign_cline)
    novoalign -d some_db -f some_seq.txt -F PRBnSEQ -r 0.99 -i 250 20 -m 100

    You would typically run the command line with novoalign_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    Last checked against version: 2.05.04
    """

    def __init__(self, cmd="novoalign", **kwargs):
        """Initialize the class.

        cmd is the name of the executable; keyword arguments are passed on
        to AbstractCommandline together with the option definitions below.
        """
        READ_FORMAT = ["FA", "SLXFQ", "STDFQ", "ILMFQ", "PRB", "PRBnSEQ"]
        REPORT_FORMAT = ["Native", "Pairwise", "SAM"]
        REPEAT_METHOD = ["None", "Random", "All", "Exhaustive", "0.99"]

        def is_int(value):
            return isinstance(value, int)

        def is_str(value):
            return isinstance(value, str)

        def is_repeat_method(value):
            # Only the first word is checked because 'All' and 'Exhaustive'
            # may be followed by a limit.  Non-strings and empty strings are
            # rejected instead of raising AttributeError/IndexError.
            if not isinstance(value, str):
                return False
            fields = value.split()
            return bool(fields) and fields[0] in REPEAT_METHOD

        def is_fragment(value):
            # Two whitespace-separated fields given as one string.
            return isinstance(value, str) and len(value.split()) == 2

        def checked(names, description, checker):
            """Build a validated option written as separate name and value."""
            return _Option(
                names, description, checker_function=checker, equate=False
            )

        read_formats = ", ".join(READ_FORMAT)
        report_formats = ", ".join(REPORT_FORMAT)
        repeat_methods = ", ".join(REPEAT_METHOD)

        self.parameters = [
            _Option(
                ["-d", "database"], "database filename", filename=True, equate=False
            ),
            _Option(["-f", "readfile"], "read file", filename=True, equate=False),
            checked(
                ["-F", "format"],
                f"Format of read files.\n\nAllowed values: {read_formats}",
                lambda x: x in READ_FORMAT,
            ),
            # Alignment scoring options
            checked(["-t", "threshold"], "Threshold for alignment score", is_int),
            checked(["-g", "gap_open"], "Gap opening penalty [default: 40]", is_int),
            checked(["-x", "gap_extend"], "Gap extend penalty [default: 15]", is_int),
            checked(
                ["-u", "unconverted"],
                "Experimental: unconverted cytosines penalty in bisulfite mode\n\n"
                "Default: no penalty",
                is_int,
            ),
            # Quality control and read filtering
            checked(
                ["-l", "good_bases"],
                "Minimum number of good quality bases [default: log(N_g, 4) + 5]",
                is_int,
            ),
            checked(
                ["-h", "homopolymer"],
                "Homopolymer read filter [default: 20; disable: negative value]",
                is_int,
            ),
            # Read preprocessing options
            checked(
                ["-a", "adapter3"],
                "Strips a 3' adapter sequence prior to alignment.\n\n"
                "With paired ends two adapters can be specified",
                is_str,
            ),
            checked(
                ["-n", "truncate"],
                "Truncate to specific length before alignment",
                is_int,
            ),
            checked(
                ["-s", "trimming"],
                "If fail to align, trim by s bases until they map or become shorter than l.\n\n"
                "Default: 2",
                is_int,
            ),
            checked(
                ["-5", "adapter5"],
                "Strips a 5' adapter sequence.\n\n"
                "Similar to -a (adaptor3), but on the 5' end.",
                is_str,
            ),
            # Reporting options
            checked(
                ["-o", "report"],
                "Specifies the report format.\n\n"
                f"Allowed values: {report_formats}\nDefault: Native",
                lambda x: x in REPORT_FORMAT,
            ),
            checked(
                ["-Q", "quality"],
                "Lower threshold for an alignment to be reported [default: 0]",
                is_int,
            ),
            checked(
                ["-R", "repeats"],
                "If score difference is higher, report repeats.\n\n"
                "Otherwise -r read method applies [default: 5]",
                is_int,
            ),
            checked(
                ["-r", "r_method"],
                "Methods to report reads with multiple matches.\n\n"
                f"Allowed values: {repeat_methods}\n"
                "'All' and 'Exhaustive' accept limits.",
                is_repeat_method,
            ),
            checked(
                ["-e", "recorded"],
                "Alignments recorded with score equal to the best.\n\n"
                "Default: 1000 in default read method, otherwise no limit.",
                is_int,
            ),
            checked(
                ["-q", "qual_digits"],
                "Decimal digits for quality scores [default: 0]",
                is_int,
            ),
            # Paired end options
            checked(
                ["-i", "fragment"],
                "Fragment length (2 reads + insert) and standard deviation [default: 250 30]",
                is_fragment,
            ),
            checked(
                ["-v", "variation"],
                "Structural variation penalty [default: 70]",
                is_int,
            ),
            # miRNA mode
            checked(
                ["-m", "miRNA"],
                "Sets miRNA mode and optionally sets a value for the region scanned [default: off]",
                is_int,
            ),
            # Multithreading
            checked(
                ["-c", "cores"],
                "Number of threads, disabled on free versions [default: number of cores]",
                is_int,
            ),
            # Quality calibrations
            checked(
                ["-k", "read_cal"],
                "Read quality calibration from file (mismatch counts)",
                is_str,
            ),
            checked(
                ["-K", "write_cal"],
                "Accumulate mismatch counts and write to file",
                is_str,
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
# When the module is executed as a script, call Biopython's run_doctest
# helper (presumably runs the doctest examples in the docstrings above).
if __name__ == "__main__":
from Bio._utils import run_doctest
run_doctest()

View File

@ -1,63 +0,0 @@
# Copyright 2009 by Osvaldo Zagordi. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Sequencing related command line application wrappers (OBSOLETE).
We have decided to remove this module in future, and instead recommend
building your command and invoking it via the subprocess module directly.
"""
from ._bwa import BwaAlignCommandline
from ._bwa import BwaBwaswCommandline
from ._bwa import BwaIndexCommandline
from ._bwa import BwaMemCommandline
from ._bwa import BwaSampeCommandline
from ._bwa import BwaSamseCommandline
from ._Novoalign import NovoalignCommandline
from ._samtools import SamtoolsCalmdCommandline
from ._samtools import SamtoolsCatCommandline
from ._samtools import SamtoolsFaidxCommandline
from ._samtools import SamtoolsFixmateCommandline
from ._samtools import SamtoolsIdxstatsCommandline
from ._samtools import SamtoolsIndexCommandline
from ._samtools import SamtoolsMergeCommandline
from ._samtools import SamtoolsMpileupCommandline
from ._samtools import SamtoolsPhaseCommandline
from ._samtools import SamtoolsReheaderCommandline
from ._samtools import SamtoolsRmdupCommandline
from ._samtools import SamtoolsTargetcutCommandline
from ._samtools import SamtoolsVersion0xSortCommandline
from ._samtools import SamtoolsVersion0xSortCommandline as SamtoolsSortCommandline
from ._samtools import SamtoolsVersion1xSortCommandline
from ._samtools import SamtoolsViewCommandline
# Declare the public names explicitly so that they show up in the API docs.
# SamtoolsSortCommandline is the alias of SamtoolsVersion0xSortCommandline
# created by the import above.
__all__ = (
"BwaIndexCommandline",
"BwaAlignCommandline",
"BwaSamseCommandline",
"BwaSampeCommandline",
"BwaBwaswCommandline",
"BwaMemCommandline",
"NovoalignCommandline",
"SamtoolsViewCommandline",
"SamtoolsCalmdCommandline",
"SamtoolsCatCommandline",
"SamtoolsFaidxCommandline",
"SamtoolsFixmateCommandline",
"SamtoolsIdxstatsCommandline",
"SamtoolsIndexCommandline",
"SamtoolsMergeCommandline",
"SamtoolsMpileupCommandline",
"SamtoolsPhaseCommandline",
"SamtoolsReheaderCommandline",
"SamtoolsRmdupCommandline",
"SamtoolsSortCommandline",
"SamtoolsVersion0xSortCommandline",
"SamtoolsVersion1xSortCommandline",
"SamtoolsTargetcutCommandline",
)

View File

@ -1,643 +0,0 @@
# Copyright 2013 Saket Choudhary. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for bwa."""
from Bio.Application import _Argument
from Bio.Application import _Option
from Bio.Application import _StaticArgument
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
class BwaIndexCommandline(AbstractCommandline):
"""Command line wrapper for Burrows Wheeler Aligner (BWA) index.
Index database sequences in the FASTA format, equivalent to::
$ bwa index [-p prefix] [-a algoType] [-c] <in.db.fasta>
See http://bio-bwa.sourceforge.net/bwa.shtml for details.
Note that this wrapper declares the algorithm (``-a``) and the input
file as required, while the prefix (``-p``) and ``-c`` are optional.
Examples
--------
>>> from Bio.Sequencing.Applications import BwaIndexCommandline
>>> reference_genome = "/path/to/reference_genome.fasta"
>>> index_cmd = BwaIndexCommandline(infile=reference_genome, algorithm="bwtsw")
>>> print(index_cmd)
bwa index -a bwtsw /path/to/reference_genome.fasta
You would typically run the command using index_cmd() or via the
Python subprocess module, as described in the Biopython tutorial.
"""
def __init__(self, cmd="bwa", **kwargs):
"""Initialize the class.
cmd is the name of the executable; keyword arguments are passed on to
AbstractCommandline together with the parameter definitions below.
"""
# NOTE(review): AbstractCommandline.__init__ is also given cmd below and
# presumably sets program_name itself - confirm whether this is needed.
self.program_name = cmd
self.parameters = [
# Fixed "index" sub-command word.
_StaticArgument("index"),
# Required; only "is" and "bwtsw" pass the checker.
_Option(
["-a", "a", "algorithm"],
"""Algorithm for constructing BWT index.
Available options are:
- is: IS linear-time algorithm for constructing suffix array.
It requires 5.37N memory where N is the size of the database.
IS is moderately fast, but does not work with database larger
than 2GB. IS is the default algorithm due to its simplicity.
- bwtsw: Algorithm implemented in BWT-SW. This method works with the
whole human genome, but it does not work with database
smaller than 10MB and it is usually slower than IS.""",
checker_function=lambda x: x in ["is", "bwtsw"],
equate=False,
is_required=True,
),
_Option(
["-p", "p", "prefix"],
"Prefix of the output database [same as db filename]",
equate=False,
is_required=False,
),
_Argument(["infile"], "Input file name", filename=True, is_required=True),
# NOTE(review): this switch is listed after the positional infile
# argument, whereas the usage line in the class docstring puts -c before
# the input file - confirm the resulting argument order is intended.
_Switch(
["-c", "c"],
"Build color-space index. The input fasta should be in nucleotide space.",
),
]
AbstractCommandline.__init__(self, cmd, **kwargs)
class BwaAlignCommandline(AbstractCommandline):
    """Command line wrapper for Burrows Wheeler Aligner (BWA) aln.

    Run a BWA alignment, equivalent to::

        $ bwa aln [...] <in.db.fasta> <in.query.fq> > <out.sai>

    See http://bio-bwa.sourceforge.net/bwa.shtml for details.

    Examples
    --------
    >>> from Bio.Sequencing.Applications import BwaAlignCommandline
    >>> reference_genome = "/path/to/reference_genome.fasta"
    >>> read_file = "/path/to/read_1.fq"
    >>> output_sai_file = "/path/to/read_1.sai"
    >>> align_cmd = BwaAlignCommandline(reference=reference_genome, read_file=read_file)
    >>> print(align_cmd)
    bwa aln /path/to/reference_genome.fasta /path/to/read_1.fq

    You would typically run the command line using align_cmd(stdout=output_sai_file)
    or via the Python subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="bwa", **kwargs):
        """Initialize the class.

        ``cmd`` is stored as ``program_name``; the ``aln`` sub-command, the
        two required file arguments and all supported options are collected
        in ``self.parameters`` before delegating to
        ``AbstractCommandline.__init__`` with ``cmd`` and ``kwargs``.
        """
        self.program_name = cmd

        def _is_int(value):
            return isinstance(value, int)

        def _is_number(value):
            return isinstance(value, (int, float))

        def _int_option(names, description):
            # Every integer-valued option shares the same checker and is
            # written as "-x VALUE" rather than "-x=VALUE".
            return _Option(
                names, description, checker_function=_is_int, equate=False
            )

        self.parameters = [
            _StaticArgument("aln"),
            _Argument(
                ["reference"], "Reference file name", filename=True, is_required=True
            ),
            _Argument(["read_file"], "Read file name", filename=True, is_required=True),
            _Option(
                ["-n", "n"],
                "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]",
                checker_function=_is_number,
                equate=False,
            ),
            _Option(
                ["-o", "o"],
                # The help text used to be a verbatim copy of the -n text;
                # the BWA manual documents -o as the gap-open limit.
                "Maximum number of gap opens [1]",
                # The checker is left as permissive as before so callers
                # that already pass validation keep working.
                checker_function=_is_number,
                equate=False,
            ),
            _int_option(
                ["-e", "e"],
                "Maximum number of gap extensions, -1 for k-difference mode (disallowing long gaps) [-1]",
            ),
            _int_option(
                ["-d", "d"],
                "Disallow a long deletion within INT bp towards the 3-end [16]",
            ),
            _int_option(
                ["-i", "i"],
                "Disallow an indel within INT bp towards the ends [5]",
            ),
            _int_option(
                ["-l", "l"],
                "Take the first INT subsequence as seed.\n"
                "If INT is larger than the query sequence, seeding will be disabled.\n"
                "For long reads, this option is typically ranged from 25 to 35 for\n"
                "-k 2. [inf]",
            ),
            _int_option(["-k", "k"], "Maximum edit distance in the seed [2]"),
            _int_option(
                ["-t", "t"], "Number of threads (multi-threading mode) [1]"
            ),
            _int_option(
                ["-M", "M"],
                "Mismatch penalty. BWA will not search for suboptimal hits with a score lower than (bestScore-misMsc). [3]",
            ),
            _int_option(["-O", "O"], "Gap open penalty [11]"),
            _int_option(["-E", "E"], "Gap extension penalty [4]"),
            _int_option(
                ["-R", "R"],
                "Proceed with suboptimal alignments if there are no more than INT equally best hits.\n"
                "This option only affects paired-end mapping. Increasing this threshold helps\n"
                "to improve the pairing accuracy at the cost of speed, especially for short\n"
                "reads (~32bp).",
            ),
            _int_option(
                ["-q", "q"],
                "Parameter for read trimming [0].\n"
                r"BWA trims a read down to argmax_x{\sum_{i=x+1}^l(INT-q_i)} if q_l<INT"
                "\n"
                "where l is the original read length.",
            ),
            _int_option(
                ["-B", "B"],
                "Length of barcode starting from the 5-end. When INT is positive, the barcode of each read will be trimmed before mapping and will be written at the BC SAM tag. For paired-end reads, the barcode from both ends are concatenated. [0]",
            ),
            _Switch(
                ["-c", "c"],
                "Reverse query but not complement it, which is required for alignment in the color space.",
            ),
            _Switch(
                ["-N", "N"],
                "Disable iterative search. All hits with no more than maxDiff differences will be found. This mode is much slower than the default.",
            ),
            _Switch(
                ["-I", "I"],
                "The input is in the Illumina 1.3+ read format (quality equals ASCII-64).",
            ),
            _Switch(
                ["-b", "b"], "Specify the input read sequence file is the BAM format"
            ),
            _Switch(
                ["-b1", "b1"],
                "When -b is specified, only use the first read in a read pair in mapping (skip single-end reads and the second reads).",
            ),
            _Switch(
                ["-b2", "b2"],
                "When -b is specified, only use the second read in a read pair in mapping.",
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
class BwaSamseCommandline(AbstractCommandline):
    """Command line wrapper for Burrows Wheeler Aligner (BWA) samse.

    Generate alignments in the SAM format given single-end reads.
    Equivalent to::

        $ bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam>

    See http://bio-bwa.sourceforge.net/bwa.shtml for details.

    Examples
    --------
    >>> from Bio.Sequencing.Applications import BwaSamseCommandline
    >>> reference_genome = "/path/to/reference_genome.fasta"
    >>> read_file = "/path/to/read_1.fq"
    >>> sai_file = "/path/to/read_1.sai"
    >>> output_sam_file = "/path/to/read_1.sam"
    >>> samse_cmd = BwaSamseCommandline(reference=reference_genome,
    ...                                 read_file=read_file, sai_file=sai_file)
    >>> print(samse_cmd)
    bwa samse /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_1.fq

    You would typically run the command line using samse_cmd(stdout=output_sam_file)
    or via the Python subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="bwa", **kwargs):
        """Initialize the class.

        ``cmd`` is stored as ``program_name``; the ``samse`` sub-command,
        the three required file arguments and the ``-n``/``-r`` options are
        collected in ``self.parameters`` before delegating to
        ``AbstractCommandline.__init__`` with ``cmd`` and ``kwargs``.
        """
        self.program_name = cmd
        self.parameters = [
            _StaticArgument("samse"),
            _Argument(
                ["reference"], "Reference file name", filename=True, is_required=True
            ),
            _Argument(["sai_file"], "Sai file name", filename=True, is_required=True),
            _Argument(
                ["read_file"], "Read file name", filename=True, is_required=True
            ),
            _Option(
                ["-n", "n"],
                "Maximum number of alignments to output in the XA tag for reads paired properly.\n"
                "If a read has more than INT hits, the XA tag will not be written. [3]",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            _Option(
                ["-r", "r"],
                "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]",
                # The read group is text, so validate it as a string; the
                # previous int check rejected every usable value.
                checker_function=lambda x: isinstance(x, str),
                equate=False,
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
class BwaSampeCommandline(AbstractCommandline):
    r"""Command line wrapper for Burrows Wheeler Aligner (BWA) sampe.

    Generate alignments in the SAM format given paired-end reads.
    Equivalent to::

        $ bwa sampe [...] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq> > <out.sam>

    See http://bio-bwa.sourceforge.net/bwa.shtml for details.

    Examples
    --------
    >>> from Bio.Sequencing.Applications import BwaSampeCommandline
    >>> reference_genome = "/path/to/reference_genome.fasta"
    >>> read_file1 = "/path/to/read_1.fq"
    >>> read_file2 = "/path/to/read_2.fq"
    >>> sai_file1 = "/path/to/read_1.sai"
    >>> sai_file2 = "/path/to/read_2.sai"
    >>> output_sam_file = "/path/to/output.sam"
    >>> read_group = r"@RG\tID:foo\tSM:bar"  # BWA will turn backslash-t into tab
    >>> sampe_cmd = BwaSampeCommandline(reference=reference_genome,
    ...                                 sai_file1=sai_file1, sai_file2=sai_file2,
    ...                                 read_file1=read_file1, read_file2=read_file2,
    ...                                 r=read_group)
    >>> print(sampe_cmd)
    bwa sampe /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_2.sai /path/to/read_1.fq /path/to/read_2.fq -r @RG\tID:foo\tSM:bar

    You would typically run the command line using sampe_cmd(stdout=output_sam_file)
    or via the Python subprocess module, as described in the Biopython tutorial.
    """

    # TODO - Should the read group have a raw tab in it, or \t?

    def __init__(self, cmd="bwa", **kwargs):
        """Build the ``bwa sampe`` parameter list and delegate to the base class.

        ``cmd`` is kept as ``program_name`` and passed on, with ``kwargs``,
        to ``AbstractCommandline.__init__``.
        """
        self.program_name = cmd

        def _is_int(value):
            return isinstance(value, int)

        def _is_str(value):
            return isinstance(value, str)

        def _required_file(name, description):
            # All five positional inputs are mandatory file names.
            return _Argument([name], description, filename=True, is_required=True)

        def _int_option(names, description):
            return _Option(
                names, description, checker_function=_is_int, equate=False
            )

        parameters = [_StaticArgument("sampe")]
        parameters.append(_required_file("reference", "Reference file name"))
        parameters.append(_required_file("sai_file1", "Sai file 1"))
        parameters.append(_required_file("sai_file2", "Sai file 2"))
        parameters.append(_required_file("read_file1", "Read file 1"))
        parameters.append(_required_file("read_file2", "Read file 2"))
        parameters.append(
            _int_option(
                ["-a", "a"],
                "Maximum insert size for a read pair to be considered being mapped properly [500].\n"
                "Since 0.4.5, this option is only used when there are not enough\n"
                "good alignments to infer the distribution of insert sizes.",
            )
        )
        parameters.append(
            _int_option(
                ["-o", "o"],
                "Maximum occurrences of a read for pairing [100000].\n"
                "A read with more occurrences will be treated as a single-end read.\n"
                "Reducing this parameter helps faster pairing.",
            )
        )
        parameters.append(
            _int_option(
                ["-n", "n"],
                "Maximum number of alignments to output in the XA tag for reads paired properly [3].\n"
                "If a read has more than INT hits, the XA tag will not be written.",
            )
        )
        parameters.append(
            _int_option(
                ["-N", "N"],
                "Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons) [10].\n"
                "If a read has more than INT hits, the XA tag will not be written.",
            )
        )
        parameters.append(
            _Option(
                ["-r", "r"],
                "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]",
                checker_function=_is_str,
                equate=False,
            )
        )
        self.parameters = parameters
        AbstractCommandline.__init__(self, cmd, **kwargs)
class BwaBwaswCommandline(AbstractCommandline):
    """Command line wrapper for Burrows Wheeler Aligner (BWA) bwasw.

    Align query sequences from FASTQ files. Equivalent to::

        $ bwa bwasw [...] <in.db.fasta> <in.fq>

    See http://bio-bwa.sourceforge.net/bwa.shtml for details.

    Examples
    --------
    >>> from Bio.Sequencing.Applications import BwaBwaswCommandline
    >>> reference_genome = "/path/to/reference_genome.fasta"
    >>> read_file = "/path/to/read_1.fq"
    >>> bwasw_cmd = BwaBwaswCommandline(reference=reference_genome, read_file=read_file)
    >>> print(bwasw_cmd)
    bwa bwasw /path/to/reference_genome.fasta /path/to/read_1.fq

    You would typically run the command line using bwasw_cmd() or via the
    Python subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="bwa", **kwargs):
        """Initialize the class.

        ``cmd`` is stored as ``program_name``; the ``bwasw`` sub-command,
        the file arguments (``mate_file`` is optional) and the scoring and
        heuristic options are collected in ``self.parameters`` before
        delegating to ``AbstractCommandline.__init__``.
        """
        self.program_name = cmd

        def _is_int(value):
            return isinstance(value, int)

        def _int_option(names, description):
            return _Option(
                names, description, checker_function=_is_int, equate=False
            )

        self.parameters = [
            _StaticArgument("bwasw"),
            _Argument(
                ["reference"], "Reference file name", filename=True, is_required=True
            ),
            _Argument(["read_file"], "Read file", filename=True, is_required=True),
            _Argument(["mate_file"], "Mate file", filename=True, is_required=False),
            _int_option(["-a", "a"], "Score of a match [1]"),
            _int_option(["-b", "b"], "Mismatch penalty [3]"),
            _int_option(["-q", "q"], "Gap open penalty [5]"),
            _int_option(
                ["-r", "r"],
                "Gap extension penalty. The penalty for a contiguous gap of size k is q+k*r. [2]",
            ),
            _int_option(
                ["-t", "t"], "Number of threads in the multi-threading mode [1]"
            ),
            _int_option(["-w", "w"], "Band width in the banded alignment [33]"),
            _int_option(
                ["-T", "T"], "Minimum score threshold divided by a [37]"
            ),
            _Option(
                ["-c", "c"],
                "Coefficient for threshold adjustment according to query length [5.5].\n"
                "Given an l-long query, the threshold for a hit to be retained is\n"
                "a*max{T,c*log(l)}.",
                # Accept whole numbers as well: a float-only check turned
                # away values such as 5 for no good reason.
                checker_function=lambda x: isinstance(x, (int, float)),
                equate=False,
            ),
            _int_option(
                ["-z", "z"],
                "Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1]",
            ),
            _int_option(
                ["-s", "s"],
                "Maximum SA interval size for initiating a seed [3].\n"
                "Higher -s increases accuracy at the cost of speed.",
            ),
            _int_option(
                ["-N", "N"],
                "Minimum number of seeds supporting the resultant alignment to skip reverse alignment. [5]",
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
class BwaMemCommandline(AbstractCommandline):
    """Command line wrapper for Burrows Wheeler Aligner (BWA) mem.

    Run a BWA-MEM alignment, with single- or paired-end reads, equivalent to::

        $ bwa mem [...] <in.db.fasta> <in1.fq> <in2.fq> > <out.sam>

    See http://bio-bwa.sourceforge.net/bwa.shtml for details.

    Examples
    --------
    >>> from Bio.Sequencing.Applications import BwaMemCommandline
    >>> reference_genome = "/path/to/reference_genome.fasta"
    >>> read_file = "/path/to/read_1.fq"
    >>> output_sam_file = "/path/to/output.sam"
    >>> align_cmd = BwaMemCommandline(reference=reference_genome, read_file1=read_file)
    >>> print(align_cmd)
    bwa mem /path/to/reference_genome.fasta /path/to/read_1.fq

    You would typically run the command line using align_cmd(stdout=output_sam_file)
    or via the Python subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="bwa", **kwargs):
        """Initialize the class.

        ``cmd`` is stored as ``program_name``; the ``mem`` sub-command, the
        file arguments (``read_file2`` is optional) and the supported
        options and switches are collected in ``self.parameters`` before
        delegating to ``AbstractCommandline.__init__``.
        """
        self.program_name = cmd

        def _is_int(value):
            return isinstance(value, int)

        def _int_option(names, description):
            return _Option(
                names, description, checker_function=_is_int, equate=False
            )

        self.parameters = [
            _StaticArgument("mem"),
            _Argument(
                ["reference"], "Reference file name", filename=True, is_required=True
            ),
            _Argument(
                ["read_file1"], "Read 1 file name", filename=True, is_required=True
            ),
            _Argument(
                ["read_file2"], "Read 2 file name", filename=True, is_required=False
            ),
            _int_option(["-t", "t"], "Number of threads [1]"),
            _int_option(
                ["-k", "k"],
                "Minimum seed length. Matches shorter than INT will be missed. The alignment speed is usually insensitive to this value unless it significantly deviates 20. [19]",
            ),
            _int_option(
                ["-w", "w"],
                "Band width. Essentially, gaps longer than INT will not be found. Note that the maximum gap length is also affected by the scoring matrix and the hit length, not solely determined by this option. [100]",
            ),
            _int_option(
                ["-d", "d"],
                r"Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between the best and the current extension score is above \|i-j\|*A+INT, where i and j are the current positions of the query and reference, respectively, and A is the matching score. Z-dropoff is similar to BLAST's X-dropoff except that it doesn't penalize gaps in one of the sequences in the alignment. Z-dropoff not only avoids unnecessary extension, but also reduces poor alignments inside a long good alignment. [100]",
            ),
            _Option(
                ["-r", "r"],
                "Trigger re-seeding for a MEM longer than minSeedLen*FLOAT. This is a key heuristic parameter for tuning the performance. Larger value yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5]",
                checker_function=lambda x: isinstance(x, (int, float)),
                equate=False,
            ),
            _int_option(
                ["-c", "c"],
                "Discard a MEM if it has more than INT occurrence in the genome. This is an insensitive parameter. [10000]",
            ),
            _int_option(["-A", "A"], "Matching score. [1]"),
            _int_option(
                ["-B", "B"],
                "Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]",
            ),
            _int_option(["-O", "O"], "Gap open penalty. [6]"),
            _int_option(
                ["-E", "E"],
                "Gap extension penalty. A gap of length k costs O + k*E (i.e. -O is for opening a zero-length gap). [1]",
            ),
            _int_option(
                ["-L", "L"],
                "Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best score reaching the end of query. If this score is larger than the best SW score minus the clipping penalty, clipping will not be applied. Note that in this case, the SAM AS tag reports the best SW score; clipping penalty is not deducted. [5]",
            ),
            _int_option(
                ["-U", "U"],
                "Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as scoreRead1+scoreRead2-INT and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these two scores to determine whether we should force pairing. [9] ",
            ),
            _Option(
                ["-R", "R"],
                # The help text previously read "'t' can be used": the
                # backslash had been lost. Show the literal two-character
                # sequence the user is expected to type, in the example too.
                "Complete read group header line. '\\t' can be used in STR and will be converted to a TAB in the output SAM. The read group ID will be attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'. [null]",
                checker_function=lambda x: isinstance(x, str),
                equate=False,
            ),
            _int_option(
                ["-T", "T"],
                "Don't output alignment with score lower than INT. This option only affects output. [30]",
            ),
            _int_option(
                ["-v", "v"],
                "Control the verbose level of the output. This option has not been fully supported throughout BWA. Ideally, a value 0 for disabling all the output to stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for all normal messages; 4 or higher for debugging. When this option takes value 4, the output is not SAM. [3]",
            ),
            _Switch(
                ["-P", "P"],
                "In the paired-end mode, perform SW to rescue missing hits only but do not try to find hits that fit a proper pair.",
            ),
            _Switch(
                ["-p", "p"],
                "Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details.",
            ),
            _Switch(
                ["-a", "a"],
                "Output all found alignments for single-end or unpaired paired-end reads. These alignments will be flagged as secondary alignments.",
            ),
            _Switch(
                ["-C", "C"],
                "Append FASTA/Q comment to SAM output. This option can be used to transfer read meta information (e.g. barcode) to the SAM output. Note that the FASTA/Q comment (the string after a space in the header line) must conform the SAM spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output.",
            ),
            _Switch(
                ["-H", "H"],
                "Use hard clipping 'H' in the SAM output. This option may dramatically reduce the redundancy of output when mapping long contig or BAC sequences.",
            ),
            _Switch(
                ["-M", "M"],
                "Mark shorter split hits as secondary (for Picard compatibility).",
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
# Script entry point: when this module is executed directly, import the
# project's doctest helper and call it (presumably it runs the examples
# embedded in the docstrings above - confirm in Bio._utils).
if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()

File diff suppressed because it is too large Load Diff

View File

@ -1,15 +0,0 @@
# Copyright 2009 by Bartek Wilczynski. All rights reserved.
# Revisions copyright 2009 by Peter Cock.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Motif command line tool wrappers (OBSOLETE).
We have decided to remove this module in future, and instead recommend
building your command and invoking it via the subprocess module directly.
"""
from ._xxmotif import XXmotifCommandline

View File

@ -1,264 +0,0 @@
# Copyright 2012 by Christian Brueffer. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the motif finding program XXmotif."""
import os
from Bio.Application import _Argument
from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline
class XXmotifCommandline(AbstractCommandline):
    """Command line wrapper for XXmotif.

    http://xxmotif.genzentrum.lmu.de/

    Notes
    -----
    Last checked against version: 1.3

    References
    ----------
    Luehr S, Hartmann H, and Söding J. The XXmotif web server for eXhaustive,
    weight matriX-based motif discovery in nucleotide sequences,
    Nucleic Acids Res. 40: W104-W109 (2012).

    Hartmann H, Guthoehrlein EW, Siebert M., Luehr S, and Söding J. P-value
    based regulatory motif discovery using positional weight matrices,
    Genome Res. 23: 181-194 (2013)

    Examples
    --------
    >>> from Bio.motifs.applications import XXmotifCommandline
    >>> out_dir = "results"
    >>> in_file = "sequences.fasta"
    >>> xxmotif_cline = XXmotifCommandline(outdir=out_dir, seqfile=in_file, revcomp=True)
    >>> print(xxmotif_cline)
    XXmotif results sequences.fasta --revcomp

    You would typically run the command line with xxmotif_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="XXmotif", **kwargs):
        """Initialize the class.

        Collects the two required positional arguments and every supported
        option/switch in ``self.parameters``, then delegates to
        ``AbstractCommandline.__init__`` with ``cmd`` and ``kwargs``.
        """
        # order of parameters is the same as in XXmotif --help
        _valid_alphabet = set("ACGTNX")

        def _is_int(value):
            return isinstance(value, int)

        def _has_valid_letter(motif):
            # NOTE(review): this passes as soon as ONE character is in the
            # alphabet; kept as-is because tightening it to "all" could
            # reject motifs that are accepted today - confirm the intent.
            return any((c in _valid_alphabet) for c in motif)

        self.parameters = [
            _Argument(
                ["outdir", "OUTDIR"],
                "output directory for all results",
                filename=True,
                is_required=True,
                # XXmotif currently does not accept spaces in the outdir name
                checker_function=lambda x: " " not in x,
            ),
            _Argument(
                ["seqfile", "SEQFILE"],
                "file name with sequences from positive set in FASTA format",
                filename=True,
                is_required=True,
                # XXmotif currently only accepts a pure filename
                checker_function=lambda x: os.path.split(x)[0] == "",
            ),
            # Options
            _Option(
                ["--negSet", "negSet", "NEGSET", "negset"],
                "sequence set which has to be used as a reference set",
                filename=True,
                equate=False,
            ),
            _Switch(
                ["--zoops", "ZOOPS", "zoops"],
                "use zero-or-one occurrence per sequence model (DEFAULT)",
            ),
            _Switch(
                ["--mops", "MOPS", "mops"], "use multiple occurrence per sequence model"
            ),
            _Switch(
                ["--oops", "OOPS", "oops"], "use one occurrence per sequence model"
            ),
            _Switch(
                ["--revcomp", "REVCOMP", "revcomp"],
                "search in reverse complement of sequences as well (DEFAULT: NO)",
            ),
            _Option(
                [
                    "--background-model-order",
                    "background-model-order",
                    "BACKGROUND-MODEL-ORDER",
                    "background_model_order",
                ],
                "order of background distribution (DEFAULT: 2, 8(--negset) )",
                checker_function=_is_int,
                equate=False,
            ),
            _Option(
                ["--pseudo", "PSEUDO", "pseudo"],
                "percentage of pseudocounts used (DEFAULT: 10)",
                checker_function=_is_int,
                equate=False,
            ),
            _Option(
                ["-g", "--gaps", "GAPS", "gaps"],
                "maximum number of gaps used for start seeds [0-3] (DEFAULT: 0)",
                # The old check was ``x in [0 - 3]``, i.e. ``x in [-3]``,
                # which rejected every documented value. Spell the range out.
                checker_function=lambda x: x in [0, 1, 2, 3],
                equate=False,
            ),
            _Option(
                ["--type", "TYPE", "type"],
                # Trailing space added: the two literals are concatenated.
                "defines what kind of start seeds are used (DEFAULT: ALL) "
                "possible types: ALL, FIVEMERS, PALINDROME, TANDEM, NOPALINDROME, NOTANDEM",
                checker_function=lambda x: x
                in [
                    "ALL",
                    "all",
                    "FIVEMERS",
                    "fivemers",
                    "PALINDROME",
                    "palindrome",
                    "TANDEM",
                    "tandem",
                    "NOPALINDROME",
                    "nopalindrome",
                    "NOTANDEM",
                    "notandem",
                ],
                equate=False,
            ),
            _Option(
                [
                    "--merge-motif-threshold",
                    "merge-motif-threshold",
                    "MERGE-MOTIF-THRESHOLD",
                    "merge_motif_threshold",
                ],
                "defines the similarity threshold for merging motifs (DEFAULT: HIGH) "
                "possible modes: LOW, MEDIUM, HIGH",
                checker_function=lambda x: x
                in ["LOW", "low", "MEDIUM", "medium", "HIGH", "high"],
                equate=False,
            ),
            _Switch(
                [
                    "--no-pwm-length-optimization",
                    "no-pwm-length-optimization",
                    "NO-PWM-LENGTH-OPTIMIZATION",
                    "no_pwm_length_optimization",
                ],
                "do not optimize length during iterations (runtime advantages)",
            ),
            _Option(
                [
                    "--max-match-positions",
                    "max-match-positions",
                    "MAX-MATCH-POSITIONS",
                    "max_match_positions",
                ],
                "max number of positions per motif (DEFAULT: 17, higher values will lead to very long runtimes)",
                checker_function=_is_int,
                equate=False,
            ),
            _Switch(
                ["--batch", "BATCH", "batch"],
                "suppress progress bars (reduce output size for batch jobs)",
            ),
            _Option(
                ["--maxPosSetSize", "maxPosSetSize", "MAXPOSSETSIZE", "maxpossetsize"],
                "maximum number of sequences from the positive set used [DEFAULT: all]",
                checker_function=_is_int,
                equate=False,
            ),
            # XXmotif's --help switch is deliberately not wrapped: it does
            # not make sense in biopython.
            _Option(
                ["--trackedMotif", "trackedMotif", "TRACKEDMOTIF", "trackedmotif"],
                "inspect extensions and refinement of a given seed (DEFAULT: not used)",
                checker_function=_has_valid_letter,
                equate=False,
            ),
            # Using conservation information
            _Option(
                ["--format", "FORMAT", "format"],
                "defines what kind of format the input sequences have (DEFAULT: FASTA)",
                checker_function=lambda x: x in ["FASTA", "fasta", "MFASTA", "mfasta"],
                equate=False,
            ),
            _Option(
                [
                    "--maxMultipleSequences",
                    "maxMultipleSequences",
                    "MAXMULTIPLESEQUENCES",
                    "maxmultiplesequences",
                ],
                "maximum number of sequences used in an alignment [DEFAULT: all]",
                checker_function=_is_int,
                equate=False,
            ),
            # Using localization information
            _Switch(
                ["--localization", "LOCALIZATION", "localization"],
                "use localization information to calculate combined P-values "
                "(sequences should have all the same length)",
            ),
            _Option(
                ["--downstream", "DOWNSTREAM", "downstream"],
                "number of residues in positive set downstream of anchor point (DEFAULT: 0)",
                checker_function=_is_int,
                equate=False,
            ),
            # Start with self defined motif
            _Option(
                ["-m", "--startMotif", "startMotif", "STARTMOTIF", "startmotif"],
                "Start motif (IUPAC characters)",
                checker_function=_has_valid_letter,
                equate=False,
            ),
            _Option(
                ["-p", "--profileFile", "profileFile", "PROFILEFILE", "profilefile"],
                "profile file",
                filename=True,
                equate=False,
            ),
            _Option(
                ["--startRegion", "startRegion", "STARTREGION", "startregion"],
                "expected start position for motif occurrences relative to anchor point (--localization)",
                checker_function=_is_int,
                equate=False,
            ),
            _Option(
                ["--endRegion", "endRegion", "ENDREGION", "endregion"],
                "expected end position for motif occurrences relative to anchor point (--localization)",
                checker_function=_is_int,
                equate=False,
            ),
            # XXmotif wrapper options
            _Switch(
                ["--XXmasker", "masker"],
                "mask the input sequences for homology, repeats and low complexity regions",
            ),
            _Switch(
                ["--XXmasker-pos", "maskerpos"],
                "mask only the positive set for homology, repeats and low complexity regions",
            ),
            _Switch(
                ["--no-graphics", "nographics"], "run XXmotif without graphical output"
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
# Script entry point: when this module is executed directly, import the
# project's doctest helper and call it (presumably it runs the examples
# embedded in the docstrings above - confirm in Bio._utils).
if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()

View File

@ -180,8 +180,9 @@ Bio.Data.PDBData instead.
Bio.Application and the command line wrappers using it
------------------------------------------------------
Declared obsolete in release 1.79, and deprecated in release 1.82. Please use
the standard library subprocess module directly instead.
Declared obsolete in release 1.79, deprecated in release 1.82, and removed
in release 1.85. Please use the standard library subprocess module directly
instead.
Bio.Index
---------
@ -495,7 +496,8 @@ NCBI "legacy" BLAST tool wrappers FastacmdCommandline, BlastallCommandline,
BlastpgpCommandline and RpsBlastCommandline were declared obsolete in Release
1.53, deprecated in Release 1.61, and removed in Release 1.64, having been
replaced with wrappers for the new NCBI BLAST+ tools (e.g.
NcbiblastpCommandline and NcbipsiblastCommandline).
NcbiblastpCommandline and NcbipsiblastCommandline). This module was removed
in release 1.85 as it relied on Bio.Application, which was being removed.
Bio.Blast.ParseBlastTable
-------------------------

View File

@ -1,131 +0,0 @@
# Copyright 2013 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Bio.Application related tests for command line application wrappers.
This is intended to check generic things like argument parsing, and
stdin/stdout/stderr handling.
"""
import os
import unittest
import warnings
from Bio import BiopythonDeprecationWarning
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Application import _Argument
from Bio.Application import AbstractCommandline
class EchoApp(AbstractCommandline):
    """Minimal command line wrapper for echo command."""

    def __init__(self, cmd="echo", **kwargs):
        """Initialize wrapper for echo command.

        The wrapper exposes a single ``text`` argument; ``cmd`` and any
        keyword arguments are handed to ``AbstractCommandline.__init__``.
        """
        text_argument = _Argument(["text"], "Text to echo")
        self.parameters = [text_argument]
        AbstractCommandline.__init__(self, cmd, **kwargs)
class TestApp(unittest.TestCase):
    """Check stdout/stderr capture and redirection of a wrapped ``echo`` call.

    Each test builds ``EchoApp(text="Hello World")``, calls it with a
    combination of ``stdout``/``stderr`` arguments (booleans or file names)
    and checks the returned ``(stdout, stderr)`` pair and any files written.
    """

    @staticmethod
    def _discard(filename):
        """Delete ``filename`` if it exists; do nothing otherwise."""
        if os.path.isfile(filename):
            os.remove(filename)

    def _fresh_tmp(self, filename):
        """Clear a stale ``filename`` and schedule its removal after the test.

        Registering the removal with ``addCleanup`` means the file is
        deleted even when an assertion fails part-way through a test.
        """
        self._discard(filename)
        self.addCleanup(self._discard, filename)
        return filename

    def _assert_file_contents(self, filename, expected):
        """Assert that ``filename`` exists and holds exactly ``expected``."""
        self.assertTrue(os.path.isfile(filename), filename)
        with open(filename) as handle:
            contents = handle.read()
        self.assertEqual(contents, expected)

    def test_echo(self):
        cline = EchoApp(text="Hello World")
        stdout, stderr = cline()
        self.assertEqual(stderr, "")
        self.assertEqual(stdout, "Hello World\n")

    def test_echo_capture_both(self):
        cline = EchoApp(text="Hello World")
        stdout, stderr = cline(stdout=True, stderr=True)
        self.assertEqual(stderr, "")
        self.assertEqual(stdout, "Hello World\n")

    def test_echo_capture_stdout(self):
        cline = EchoApp(text="Hello World")
        stdout, stderr = cline(stdout=True, stderr=False)
        self.assertIsNone(stderr)
        self.assertEqual(stdout, "Hello World\n")

    def test_echo_capture_stderr(self):
        cline = EchoApp(text="Hello World")
        stdout, stderr = cline(stdout=False, stderr=True)
        self.assertEqual(stderr, "")
        self.assertIsNone(stdout)

    def test_echo_capture_neither(self):
        cline = EchoApp(text="Hello World")
        stdout, stderr = cline(stdout=False, stderr=False)
        self.assertIsNone(stderr)
        self.assertIsNone(stdout)

    def test_echo_file_stdout(self):
        cline = EchoApp(text="Hello World")
        tmp = self._fresh_tmp("echo_stdout.tmp")
        stdout, stderr = cline(stdout=tmp)
        self.assertEqual(stderr, "")
        self.assertIsNone(stdout)
        self._assert_file_contents(tmp, "Hello World\n")

    def test_echo_file_stderr(self):
        cline = EchoApp(text="Hello World")
        tmp = self._fresh_tmp("echo_stderr.tmp")
        stdout, stderr = cline(stderr=tmp)
        self.assertIsNone(stderr)
        self.assertEqual(stdout, "Hello World\n")
        self._assert_file_contents(tmp, "")

    def test_echo_file_same(self):
        cline = EchoApp(text="Hello World")
        tmp = self._fresh_tmp("echo_stdout_stderr.tmp")
        stdout, stderr = cline(stdout=tmp, stderr=tmp)
        self.assertIsNone(stderr)
        self.assertIsNone(stdout)
        self._assert_file_contents(tmp, "Hello World\n")  # stdout + stderr

    def test_echo_file_both(self):
        cline = EchoApp(text="Hello World")
        tmp = self._fresh_tmp("echo_stdout.tmp")
        tmp2 = self._fresh_tmp("echo_stderr.tmp")
        stdout, stderr = cline(stdout=tmp, stderr=tmp2)
        self.assertIsNone(stderr)
        self.assertIsNone(stdout)
        self._assert_file_contents(tmp, "Hello World\n")  # stdout
        self._assert_file_contents(tmp2, "")  # stderr
# Script entry point: run this module's tests through unittest with a
# verbose (verbosity=2) text runner.
if __name__ == "__main__":
    runner = unittest.TextTestRunner(verbosity=2)
    unittest.main(testRunner=runner)

View File

@ -1,211 +0,0 @@
# Copyright 2013 by Saket Choudhary. Based on test_Clustalw_tool.py by Peter
# Cock .
#
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Tests for calling BWA."""
import os
import sys
import unittest
import warnings
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
# TODO from Bio.Sequencing.Applications import BwaBwaswCommandline
from Bio.Sequencing.Applications import BwaAlignCommandline
from Bio.Sequencing.Applications import BwaIndexCommandline
from Bio.Sequencing.Applications import BwaMemCommandline
from Bio.Sequencing.Applications import BwaSampeCommandline
from Bio.Sequencing.Applications import BwaSamseCommandline
#################################################################
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"

# Path or command name of the bwa executable; stays None when not found.
bwa_exe = None
# Whether the aln/samse/sampe tests must be left out.  Assigned before the
# platform branch so the name exists on every platform when the test class
# body reads it; only the non-Windows probe below can switch it to True.
skip_aln_tests = False
if sys.platform == "win32":
    # TODO - Check the path?
    try:
        # This can vary depending on the Windows language.
        prog_files = os.environ["PROGRAMFILES"]
    except KeyError:
        prog_files = r"C:\Program Files"
    likely_dirs = ["bwa", "bwa-0.6.2", ""]
    likely_exes = ["bwa"]
    for folder in likely_dirs:
        if os.path.isdir(os.path.join(prog_files, folder)):
            for filename in likely_exes:
                if os.path.isfile(os.path.join(prog_files, folder, filename)):
                    bwa_exe = os.path.join(prog_files, folder, filename)
                    break
            if bwa_exe:
                break
else:
    from subprocess import getoutput

    output = getoutput("bwa")
    # Since "not found" may be in another language, try and be sure this is
    # really the bwa tool's output
    if "not found" not in output and "not recognized" not in output:
        if "bwa" in output and "alignment via Burrows-Wheeler transformation" in output:
            bwa_exe = "bwa"
    # Check whether this bwa build still knows the "aln" sub-command.
    aln_output = getoutput("bwa aln")
    if "unrecognized" in aln_output:
        skip_aln_tests = True
        print("'bwa aln' is unrecognized, skipping aln/samse/sampe tests")

if not bwa_exe:
    raise MissingExternalDependencyError(
        "Install bwa and correctly set"
        " the file path to the program if"
        " you want to use it from Biopython"
    )
class BwaTestCase(unittest.TestCase):
    """Class for implementing BWA test cases."""

    def setUp(self):
        """Define the input/output paths and the outputs to delete afterwards."""
        self.reference_file = "BWA/human_g1k_v37_truncated.fasta"
        self.reference_extensions = ["amb", "ann", "bwt", "pac", "sa"]
        self.infile1 = "BWA/HNSCC1_1_truncated.fastq"
        self.infile2 = "BWA/HNSCC1_2_truncated.fastq"
        self.saifile1 = "BWA/1.sai"
        self.saifile2 = "BWA/2.sai"
        self.samfile1 = "BWA/1.sam"
        self.samfile2 = "BWA/2.sam"
        self.samfile = "BWA/out.sam"
        self.files_to_clean = [
            self.saifile1,
            self.saifile2,
            self.samfile1,
            self.samfile2,
            self.samfile,
        ]

    def tearDown(self):
        """Delete generated output files and reference index files, if present."""
        for path in self.files_to_clean:
            if os.path.isfile(path):
                os.remove(path)
        for path in self._index_files():
            if os.path.exists(path):
                os.remove(path)

    def _index_files(self):
        """Return the reference file name combined with each index extension."""
        return [
            self.reference_file + "." + extension
            for extension in self.reference_extensions
        ]

    def _index_cmdline(self):
        """Return a BwaIndexCommandline set up for the reference file."""
        cmdline = BwaIndexCommandline(bwa_exe)
        cmdline.set_parameter("infile", self.reference_file)
        cmdline.set_parameter("algorithm", "bwtsw")
        return cmdline

    def _check_sam_header(self, cmdline, sam_path):
        """Assert that the first line of sam_path starts with "@SQ"."""
        with open(sam_path) as handle:
            headline = handle.readline()
        self.assertTrue(
            headline.startswith("@SQ"),
            f"Error generating sam files:\n{cmdline}\nOutput starts:{headline}",
        )

    def test_index(self):
        """Test for creating index files for the reference genome fasta file."""
        cmdline = self._index_cmdline()
        stdout, stderr = cmdline()
        for index_file in self._index_files():
            self.assertTrue(
                os.path.exists(index_file), f"Index File {index_file} not found"
            )
        self.assertIn(
            "Finished constructing BWT",
            str(stdout) + str(stderr),
            f"FASTA indexing failed:\n{cmdline}\nStdout:{stdout}\nStderr:{stderr}\n",
        )

    def do_aln(self, in_file, out_file):
        """Test for generating sai files given the reference and read file."""
        cmdline = BwaAlignCommandline(bwa_exe)
        cmdline.set_parameter("reference", self.reference_file)
        cmdline.read_file = in_file
        self.assertTrue(os.path.isfile(in_file))
        # The command's stdout is sent to out_file.
        stdout, stderr = cmdline(stdout=out_file)
        combined = str(stderr) + str(stdout)
        self.assertNotIn(
            "fail to locate the index",
            combined,
            "Error aligning sequence to reference:\n%s\nStdout:%s\nStderr:%s\n"
            % (cmdline, stdout, stderr),
        )

    def create_fasta_index(self):
        """Test for generating index for fasta file.

        BWA requires an indexed fasta for each alignment operation.
        This should be called to create an index before any alignment
        operation.
        """
        self._index_cmdline()()

    # The aln-based tests are only defined when the module-level probe did
    # not flag 'bwa aln' as unrecognized.
    if not skip_aln_tests:

        def test_samse(self):
            """Test for single end sequencing."""
            self.create_fasta_index()
            self.do_aln(self.infile1, self.saifile1)
            cmdline = BwaSamseCommandline(bwa_exe)
            cmdline.set_parameter("reference", self.reference_file)
            cmdline.set_parameter("read_file", self.infile1)
            cmdline.set_parameter("sai_file", self.saifile1)
            cmdline(stdout=self.samfile1)
            self._check_sam_header(cmdline, self.samfile1)

        def test_sampe(self):
            """Test for generating samfile by paired end sequencing."""
            self.create_fasta_index()
            # Generate sai files from paired end data
            self.do_aln(self.infile1, self.saifile1)
            self.do_aln(self.infile2, self.saifile2)
            cmdline = BwaSampeCommandline(bwa_exe)
            cmdline.set_parameter("reference", self.reference_file)
            cmdline.set_parameter("sai_file1", self.saifile1)
            cmdline.set_parameter("sai_file2", self.saifile2)
            cmdline.set_parameter("read_file1", self.infile1)
            cmdline.set_parameter("read_file2", self.infile2)
            cmdline(stdout=self.samfile)
            self._check_sam_header(cmdline, self.samfile)

    def test_mem(self):
        """Test for generating samfile by paired end sequencing using BWA-MEM."""
        self.create_fasta_index()
        cmdline = BwaMemCommandline(bwa_exe)
        cmdline.set_parameter("reference", self.reference_file)
        cmdline.set_parameter("read_file1", self.infile1)
        cmdline.set_parameter("read_file2", self.infile2)
        cmdline(stdout=self.samfile)
        self._check_sam_header(cmdline, self.samfile)
if __name__ == "__main__":
    # Run this module's tests directly, printing one line per test.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,399 +0,0 @@
# Copyright 2008-2011 by Peter Cock. All rights reserved.
# Revisions copyright 2012 by Christian Brueffer. All rights reserved.
#
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Tests for ClustalOmega tool."""
import os
import unittest
import warnings
from subprocess import getoutput
from Bio import Align
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
from Bio import SeqIO
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Align.Applications import ClustalOmegaCommandline
from Bio.Application import ApplicationError
#################################################################
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"

# Accept the tool only when its help text opens with the "Clustal Omega"
# banner; otherwise clustalo_exe stays None.
try:
    output = getoutput("clustalo --help")
except FileNotFoundError:
    clustalo_exe = None
else:
    clustalo_exe = "clustalo" if output.startswith("Clustal Omega") else None

if clustalo_exe is None:
    raise MissingExternalDependencyError(
        "Install clustalo if you want to use Clustal Omega from Biopython."
    )
class ClustalOmegaTestCase(unittest.TestCase):
    """Shared fixture and helper methods for the Clustal Omega tests."""

    def setUp(self):
        """Start every test with an empty set of files to delete."""
        self.files_to_clean = set()

    def tearDown(self):
        """Delete each registered path that exists as a regular file."""
        for path in filter(os.path.isfile, self.files_to_clean):
            os.remove(path)

    def standard_test_procedure(self, cline):
        """Shared test procedure used by all tests."""
        # Overwrite existing files.
        cline.force = True

        # Mark output files for later cleanup.
        self.add_file_to_clean(cline.outfile)
        if cline.guidetree_out:
            self.add_file_to_clean(cline.guidetree_out)

        # The parsed records are not used afterwards; presumably this is an
        # early check that the input reads as FASTA - TODO confirm.
        SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"))

        # repr() of the command line must evaluate back to an equal command.
        rebuilt = eval(repr(cline))
        self.assertEqual(str(rebuilt), str(cline))

        output, error = cline()
        if output:
            self.assertTrue(output.strip().startswith("CLUSTAL"))

        # Test if ClustalOmega executed successfully.
        tolerated_warnings = (
            "WARNING: Sequence type is DNA.",
            "WARNING: DNA alignment is still experimental.",
        )
        ran_cleanly = error.strip() == "" or error.startswith(tolerated_warnings)
        self.assertTrue(ran_cleanly)

        # TODO - Try and parse this with Bio.Nexus?
        if cline.guidetree_out:
            self.assertTrue(os.path.isfile(cline.guidetree_out))

    def add_file_to_clean(self, filename):
        """Add a file for deferred removal by the tearDown routine."""
        self.files_to_clean.add(filename)
#################################################################
class ClustalOmegaTestErrorConditions(ClustalOmegaTestCase):
    """Inputs for which running Clustal Omega must raise ApplicationError."""

    def _failure_message(self, cline):
        """Run cline, require an ApplicationError and return its message."""
        with self.assertRaises(ApplicationError) as caught:
            stdout, stderr = cline()
            # Only reached when the command unexpectedly succeeded.
            self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}")
        return str(caught.exception)

    def test_empty_file(self):
        """Test a non-existing input file."""
        input_file = "does_not_exist.fasta"
        self.assertFalse(os.path.isfile(input_file))
        cline = ClustalOmegaCommandline(clustalo_exe, infile=input_file)
        message = self._failure_message(cline)
        accepted = (
            "Cannot open sequence file",
            "Cannot open input file",
            "Non-zero return code",
        )
        self.assertTrue(any(text in message for text in accepted), message)

    def test_single_sequence(self):
        """Test an input file containing a single sequence."""
        input_file = "Fasta/f001"
        self.assertTrue(os.path.isfile(input_file))
        record_count = len(list(SeqIO.parse(input_file, "fasta")))
        self.assertEqual(record_count, 1)
        cline = ClustalOmegaCommandline(clustalo_exe, infile=input_file)
        message = self._failure_message(cline)
        self.assertIn("contains 1 sequence, nothing to align", message)

    def test_invalid_format(self):
        """Test an input file in an invalid format."""
        input_file = "Medline/pubmed_result1.txt"
        self.assertTrue(os.path.isfile(input_file))
        cline = ClustalOmegaCommandline(clustalo_exe, infile=input_file)
        err = self._failure_message(cline)
        # Ideally we'd catch the return code and raise the specific
        # error for "invalid format".
        self.assertIn("Can't determine format of sequence file", err)
#################################################################
class ClustalOmegaTestNormalConditions(ClustalOmegaTestCase):
    """Inputs that Clustal Omega is expected to align successfully."""

    # Expected str(alignment) for Registry/seqs.fasta; three tests share it.
    _SEQS_ALIGNMENT = """\
gi|134891 0 GATCCCTACCCTTNCCGTTGGTCTCTNTCGCTGACTCGAGGCACCTAACATCCATTCACA
0 ---------..-........|......|....|......|..............|.----
gi|129628 0 ---------MP-VVVVASSKGGAGKSTTAVVLGTELAHKGVPVTMLDCDPNRSLTI----
gi|134891 60 CCCAACACAGGCCAGCGACTTCTGGGGCTCAGCCACAGACATGGTTTGTNACTNTTGAGC
60 -----.|.||.......|....|-------------------......|.......||..
gi|129628 46 -----WANAGEVPENITALSDVT-------------------ESSIVKTIKQHDVDGAVV
gi|134891 120 TTCTGTTCCTAGAGAATCCTAGAGGCTTGATTGGCCCAGGCTGCTGTNTGTNCTGGAGG-
120 ...--------..|.|......|..............|...|..............|..-
gi|129628 82 IVD--------LEGVASRMVSRAISQADLVLIPMRPKALDATIGAQSLQLIAEEEEAIDR
gi|134891 179 -CAAAGAATCCCTACCTCCTAGGGGTGAAAGGAAATNAAAATGGAAAGTTCTTGTAGCGC
180 -.|.|...|....|.......|........|------------...........||....
gi|129628 134 KIAHAVVFTMVSPAIRSHEYTGIKASLIENG------------VEIIEPPLVERTAYSAL
gi|134891 238 AAGGCCTGACATGGGTAGCTGCTCAATAAATGCTAGTNTGTTATTTC 285
240 ...|..........|..........|.|-----.|.....|.|..-- 287
gi|129628 182 FQFGGNLHSMKSKQGNMAAAIENAEAFA-----MAIFKKLTEALR-- 222
"""
    # Expected "clustal_consensus" column annotation for the same alignment.
    _SEQS_CONSENSUS = " * * * * * * ** * * * ** * * * * * * * * * * * * ** * * * * * * * "

    def _check_alignment(self, cline, expected_text, expected_consensus):
        """Read cline.outfile as Clustal and compare text and consensus."""
        alignment = Align.read(cline.outfile, "clustal")
        self.assertEqual(str(alignment), expected_text)
        self.assertEqual(
            alignment.column_annotations["clustal_consensus"], expected_consensus
        )

    def test_simple_fasta(self):
        """Test a simple fasta file."""
        cline = ClustalOmegaCommandline(
            clustalo_exe,
            infile="Registry/seqs.fasta",
            outfile="temp_test.aln",
            outfmt="clustal",
        )
        self.standard_test_procedure(cline)
        self._check_alignment(cline, self._SEQS_ALIGNMENT, self._SEQS_CONSENSUS)

    def test_properties(self):
        """Test setting options via properties."""
        cline = ClustalOmegaCommandline(clustalo_exe)
        cline.infile = "Registry/seqs.fasta"
        cline.outfile = "temp_test.aln"
        cline.outfmt = "clustal"
        self.standard_test_procedure(cline)
        self._check_alignment(cline, self._SEQS_ALIGNMENT, self._SEQS_CONSENSUS)

    def test_input_filename_with_space(self):
        """Test an input filename containing a space."""
        input_file = "Clustalw/temp horses.fasta"
        # Build the input by converting a PHYLIP example file to FASTA.
        with open(input_file, "w") as handle:
            SeqIO.write(SeqIO.parse("Phylip/hennigian.phy", "phylip"), handle, "fasta")
        cline = ClustalOmegaCommandline(
            clustalo_exe, infile=input_file, outfile="temp_test.aln", outfmt="clustal"
        )
        self.add_file_to_clean(input_file)
        self.standard_test_procedure(cline)
        expected_text = """\
A 0 -CACACACAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAA 40
B 0 -CACACAACAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAA 40
C 0 -CACAACAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAA 40
D 0 -CAACAAAACAAAAAAAAACAAAAAAAAAAAAAAAAAAAAA 40
E 0 -CAACAAAAACAAAAAAAACAAAAAAAAAAAAAAAAAAAAA 40
F 0 ACAAAAAAAACACACAAAACAAAAAAAAAAAAAAAAAAAA- 40
G 0 ACAAAAAAAACACAACAAACAAAAAAAAAAAAAAAAAAAA- 40
H 0 ACAAAAAAAACAACAAAAACAAAAAAAAAAAAAAAAAAAA- 40
I 0 ACAAAAAAAAACAAAACAACAAAAAAAAAAAAAAAAAAAA- 40
J 0 ACAAAAAAAAACAAAAACACAAAAAAAAAAAAAAAAAAAA- 40
"""
        expected_consensus = " ** ********************** "
        self._check_alignment(cline, expected_text, expected_consensus)

    def test_output_filename_with_spaces(self):
        """Test an output filename containing spaces."""
        cline = ClustalOmegaCommandline(
            clustalo_exe,
            infile="Registry/seqs.fasta",
            outfile="temp with spaces.aln",
            outfmt="clustal",
        )
        self.standard_test_procedure(cline)
        self._check_alignment(cline, self._SEQS_ALIGNMENT, self._SEQS_CONSENSUS)

    def test_large_fasta_file(self):
        """Test a large fasta input file."""
        # Create a large input file by converting another example file
        # (See Bug 2804, this will produce so much output on stdout that
        # subprocess could suffer a deadlock and hang). Using all the
        # records should show the deadlock but is very slow - just thirty
        # seems to lockup on Mac OS X, even 20 on Linux (without the fix).
        input_file = "temp_cw_prot.fasta"
        records = list(SeqIO.parse("NBRF/Cw_prot.pir", "pir"))[:40]
        with open(input_file, "w") as handle:
            SeqIO.write(records, handle, "fasta")
        del handle, records
        cline = ClustalOmegaCommandline(
            clustalo_exe,
            infile=input_file,
            outfile="temp_cw_prot.aln",
            outfmt="clustal",
        )
        self.add_file_to_clean(input_file)
        self.standard_test_procedure(cline)
        # The result is not inspected; the output only has to be readable.
        Align.read(cline.outfile, "clustal")

    def test_newtree_files(self):
        """Test requesting a guide tree."""
        alignment_text = """\
gi|134891 0 CGGACCAGACGGACACAGGGAGAAGCTAGTTTCTTTCATGTGATTGANATNATGACTCTA
gi|134891 0 ---------CGGAGCCAGCGAGCATAT---------------------------------
gi|159293 0 ------------------------------------------------------------
gi|134891 60 CTCCTAAAAGGGAAAAANCAATATCCTTGTTTACAGAAGAGAAACAAACAAGCCCCACTC
gi|134891 18 ----------------------------------------------------GCTGCATG
gi|159293 0 --------------------------------------------GATCAAATCTGCACTG
gi|134891 120 AGCTCAGTCACAGGAGAGANCACAGAAAGTCTTAGGATCATGANCTCTGAA-AAAAAGAG
gi|134891 26 -------------------------AGGACCTTTCTATCTTACATTATGGC-TGGGAATC
gi|159293 16 TGTCTACATATAGGAAAGGTCCTGGTGTGTGCTAATGTTCCCAATGCAGGACTTGAGGAA
gi|134891 179 AAACCTTATCTTTNCTTTGTGGTTCCTTTAAACACACTCACACACACTTGGTCAGAGATG
gi|134891 60 TTACTCTTTCATCTG-------ATACCTTGTTCAGATTTCAAAATAGTTGTAGCCTTATC
gi|159293 76 GAGCTCTGTTATATGTTTCCATTTCTCTTTATCAAAGATAACCAAACCTTATGGCCCTT-
gi|134891 239 CTGTGCTTCTTGGAAGCAAGGNCTCAAAGGCAAGGTGCACGC----------AGAGGGAC
gi|134891 113 CTGGTTTTACAGATGTGAAACTT----TCAAGAGATTTACTGACTTTCCTAGAATA----
gi|159293 135 ---ATAACAATGGAGGCACTGGCTGCCTCTTAATTTTCAATCATGGACCTAAAGAAGTAC
gi|134891 289 GTTTGA--GTCTGGGATGAAGCATGTNCGTATTATTTATATGATGGAATTTCACGTTTTT
gi|134891 165 --------GT--------------TTCTCTACTGGAAACCTGATGCTTTTATAAGCCATT
gi|159293 192 TCTGAAGGGTCTCAACAATGCCAGGTGGGGACAGATATACTCAGAGATTATCCAGGTCTG
gi|134891 347 ATGTNAAGCNTGACAACACCAGGCAGGTATGAGAGGA-AAGCAAGGCCCGTCCATNGCTG
gi|134891 203 GTGATTAGGATGACTGTTACAGGCTTAGCTTTGTGTGAAANCCAGTCACCTTT------C
gi|159293 252 CCTCCCAGCGAGCC-----------TGGA------GT-ACACCAGACCCTCCTAGAGAAA
gi|134891 406 TCCGTACNCTTACGGNTTGCTTGTNGGAGNCATTTNGGTATTGTTTGTTGTAANANCCAA
gi|134891 257 TCCTAGGTAATGAGTAGTGCTGTTCATATTACTNT-------AAGTTCTATAGCATACTT
gi|159293 294 TCTGTT------------------------------------ATAATTTACCACCCACTT
gi|134891 466 AANGGGCTTTGGNNTGGNAAAA----GGGCAGANNGGGGGGGTTGGTGTNGTTTTTTGG-
gi|134891 310 GCNATCCTTTANCCATGCTTATCATANGTACCATTTGAGGAATTGNTT-----TGCCCTT
gi|159293 318 ATCCACCTTTAAACTTGGGGAA----GGNNGCN------TTTCAAATTAAATTTAATCNT
gi|134891 521 GGGGANNNTTTNGATTTGG-------TNCCGGGNTTTNGTTTNCCNCGGNACCGGNTTTT
gi|134891 365 TTG-GGTTTNTTNTTGGTAA--ANNNTTCCCGGGTGGGGGNGGTNNNGAAA---------
gi|159293 368 NGGGGGNTTTTAAACTTTAACCCTTTTNCCNTTNTNGGGGTNGGNANTTGNCCCCNTTAA
gi|134891 574 GGTTGGGGNCCATTTNTGNGGGGCNTTGGNGTTNCNTTNCCCNNNTNNGANTGGTTTNA
gi|134891 413 -----------------------------------------------------------
gi|159293 428 AGGGGGNNCCCCT-NCNNGGGGGAATAA-AACAA----------NTTNNTTT--TTT--
gi|134891 633
gi|134891 413
gi|159293 471
"""
        clustal_consensus = " * * * * * * * * * ** ** * * * * * * * * * * * ** * * * * * * * ** * * * * ** * * ** * * **** * * * * * * * * * ** * * "
        cline = ClustalOmegaCommandline(
            clustalo_exe,
            infile="Fasta/f002",
            outfile="temp_test.aln",
            guidetree_out="temp_test.dnd",
            outfmt="clustal",
        )
        self.standard_test_procedure(cline)
        self._check_alignment(cline, alignment_text, clustal_consensus)

        # Run again with a guide tree file name that contains spaces.
        cline.guidetree_out = "temp with space.dnd"
        self.standard_test_procedure(cline)
        self._check_alignment(cline, alignment_text, clustal_consensus)
if __name__ == "__main__":
    # Run this module's tests directly, printing one line per test.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,327 +0,0 @@
# Copyright 2008-2011 by Peter Cock. All rights reserved.
# Revisions copyright 2012 by Christian Brueffer. All rights reserved.
#
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
# TODO - Clean up the extra files created by clustalw? e.g. *.dnd
# and *.aln where we have not requested an explicit name?
"""Tests for Clustalw tool."""
import os
import sys
import unittest
import warnings
from Bio import AlignIO
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
from Bio import SeqIO
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Align.Applications import ClustalwCommandline
from Bio.Application import ApplicationError
#################################################################
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"

# Path or command name of the clustalw executable; stays None when not found.
clustalw_exe = None
if sys.platform == "win32":
    # TODO - Check the path?
    try:
        # This can vary depending on the Windows language.
        prog_files = os.environ["PROGRAMFILES"]
    except KeyError:
        prog_files = r"C:\Program Files"

    # Note that EBI's clustalw2 installer, e.g. clustalw-2.0.10-win.msi
    # uses C:\Program Files\ClustalW2\clustalw2.exe so we should check
    # for that.
    #
    # Some users doing a manual install have reported using
    # C:\Program Files\clustalw.exe
    #
    # Older installers might use something like this,
    # C:\Program Files\Clustalw\clustalw.exe
    #
    # One particular case is www.tc.cornell.edu currently provide a
    # clustalw1.83 installer which uses the following long location:
    # C:\Program Files\CTCBioApps\clustalw\v1.83\clustalw1.83.exe
    likely_dirs = [
        "ClustalW2",
        "",
        "Clustal",
        "Clustalw",
        "Clustalw183",
        "Clustalw1.83",
        r"CTCBioApps\clustalw\v1.83",
    ]
    likely_exes = ["clustalw2.exe", "clustalw.exe", "clustalw1.83.exe"]
    # Take the first executable found, scanning directories in list order.
    for folder in likely_dirs:
        if not os.path.isdir(os.path.join(prog_files, folder)):
            continue
        for filename in likely_exes:
            candidate_path = os.path.join(prog_files, folder, filename)
            if os.path.isfile(candidate_path):
                clustalw_exe = candidate_path
                break
        if clustalw_exe:
            break
else:
    from subprocess import getoutput

    # Note that clustalw 1.83 and clustalw 2.1 don't obey the --version
    # command, but this does cause them to quit cleanly. Otherwise they prompt
    # the user for input (causing a lock up).
    probes = (
        ("clustalw2", "clustalw2 --version"),
        ("clustalw", "clustalw --version"),
    )
    # clustalw2 is tried first; clustalw is only probed when that fails.
    for command_name, probe in probes:
        output = getoutput(probe)
        # Since "not found" may be in another language, try and be sure this is
        # really the clustalw tool's output
        if "not found" in output or "not recognized" in output:
            continue
        if "CLUSTAL" in output and "Multiple Sequence Alignments" in output:
            clustalw_exe = command_name
            break

if not clustalw_exe:
    raise MissingExternalDependencyError(
        "Install clustalw or clustalw2 if you want to use it from Biopython."
    )
class ClustalWTestCase(unittest.TestCase):
    """Class implementing common functions for ClustalW tests."""

    def setUp(self):
        """Start every test with an empty set of files to delete."""
        self.files_to_clean = set()

    def tearDown(self):
        """Delete each registered path that exists as a regular file."""
        for path in filter(os.path.isfile, self.files_to_clean):
            os.remove(path)

    def standard_test_procedure(self, cline):
        """Shared test procedure used by all tests."""
        # repr() of the command line must evaluate back to an equal command.
        rebuilt = eval(repr(cline))
        self.assertEqual(str(rebuilt), str(cline))

        def record_key(rec):
            # Input records are keyed with ":" in the id replaced by "_".
            return rec.id.replace(":", "_")

        input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"), record_key)

        # Determine name of tree file; without an explicit name
        # Clustalw will name it based on the input file
        tree_file = cline.newtree or os.path.splitext(cline.infile)[0] + ".dnd"

        # Mark generated files for later removal
        for generated in (cline.outfile, tree_file):
            self.add_file_to_clean(generated)

        output, error = cline()
        self.assertTrue(output.strip().startswith("CLUSTAL"))
        self.assertEqual(error.strip(), "")

        # Check the output...
        align = AlignIO.read(cline.outfile, "clustal")
        # The length of the alignment will depend on the version of clustalw
        # (clustalw 2.1 and clustalw 1.83 are certainly different).
        output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile, "clustal"))
        self.assertCountEqual(input_records.keys(), output_records.keys())
        for record in align:
            self.assertEqual(record.seq, output_records[record.id].seq)
            # With the gaps removed, each row must equal its input sequence.
            ungapped = str(record.seq).replace("-", "")
            self.assertEqual(ungapped, input_records[record.id].seq)

        # Check the DND file was created.
        # TODO - Try and parse this with Bio.Nexus?
        self.assertTrue(os.path.isfile(tree_file))

    def add_file_to_clean(self, filename):
        """Add a file for deferred removal by the tearDown routine."""
        self.files_to_clean.add(filename)
class ClustalWTestErrorConditions(ClustalWTestCase):
    """Test general error conditions."""

    def test_empty_file(self):
        """Test a non-existing input file."""
        input_file = "does_not_exist.fasta"
        self.assertFalse(os.path.isfile(input_file))
        cline = ClustalwCommandline(clustalw_exe, infile=input_file)
        with self.assertRaises(ApplicationError) as caught:
            cline()
            # Only reached when the command unexpectedly succeeded.
            self.fail("expected an ApplicationError")
        message = str(caught.exception)
        accepted = (
            "Cannot open sequence file",
            "Cannot open input file",
            "Non-zero return code ",
        )
        self.assertTrue(any(text in message for text in accepted), message)

    def test_single_sequence(self):
        """Test an input file containing a single sequence."""
        input_file = "Fasta/f001"
        self.assertTrue(os.path.isfile(input_file))
        record_count = len(list(SeqIO.parse(input_file, "fasta")))
        self.assertEqual(record_count, 1)
        cline = ClustalwCommandline(clustalw_exe, infile=input_file)
        try:
            stdout, stderr = cline()
        except ApplicationError:
            # Good, non-zero return code indicating an error in clustalw
            # e.g. Using clustalw 1.83 get:
            # Command 'clustalw -infile=Fasta/f001' returned non-zero exit status 4
            pass
        else:
            # Zero return code is a possible bug in clustalw 2.1?
            self.assertIn("cannot do multiple alignment", (stdout + stderr))
        empty_alignment = input_file + ".aln"
        if os.path.isfile(empty_alignment):
            # Clustalw 2.1 made an empty aln file, clustalw 1.83 did not
            self.add_file_to_clean(empty_alignment)

    def test_invalid_sequence(self):
        """Test an input file containing an invalid sequence."""
        input_file = "Medline/pubmed_result1.txt"
        self.assertTrue(os.path.isfile(input_file))
        cline = ClustalwCommandline(clustalw_exe, infile=input_file)
        try:
            stdout, stderr = cline()
        except ApplicationError as exc:
            err = str(exc)
        else:
            self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}")
        # Ideally we'd catch the return code and raise the specific
        # error for "invalid format", rather than just notice there
        # is not output file.
        # Note:
        # Python 2.3 on Windows gave (0, 'Error')
        # Python 2.5 on Windows gives [Errno 0] Error
        accepted = (
            "invalid format",
            "not produced",
            "No sequences in file",
            "Non-zero return code ",
        )
        self.assertTrue(any(text in err for text in accepted))
class ClustalWTestNormalConditions(ClustalWTestCase):
    """Tests for normal conditions."""

    def test_properties(self):
        """Test passing options via properties."""
        cline = ClustalwCommandline(clustalw_exe)
        # Every option is assigned through a property, not the constructor.
        cline.infile = "Fasta/f002"
        cline.outfile = "temp_test.aln"
        cline.align = True
        self.standard_test_procedure(cline)

    def test_simple_fasta(self):
        """Test a simple fasta input file."""
        cline = ClustalwCommandline(
            clustalw_exe, infile="Fasta/f002", outfile="temp_test.aln"
        )
        self.standard_test_procedure(cline)

    def test_newtree(self):
        """Test newtree files."""
        cline = ClustalwCommandline(
            clustalw_exe,
            infile="Registry/seqs.fasta",
            outfile="temp_test.aln",
            newtree="temp_test.dnd",
            align=True,
        )
        self.standard_test_procedure(cline)
        # Run again with a tree file name that contains spaces.
        cline.newtree = "temp with space.dnd"
        self.standard_test_procedure(cline)

    def test_large_input_file(self):
        """Test a large input file."""
        # Create a large input file by converting another example file
        # (See Bug 2804, this will produce so much output on stdout that
        # subprocess could suffer a deadlock and hang). Using all the
        # records should show the deadlock but is very slow - just thirty
        # seems to lockup on Mac OS X, even 20 on Linux (without the fix).
        input_file = "temp_cw_prot.fasta"
        records = list(SeqIO.parse("NBRF/Cw_prot.pir", "pir"))[:40]
        with open(input_file, "w") as handle:
            SeqIO.write(records, handle, "fasta")
        del records
        cline = ClustalwCommandline(
            clustalw_exe, infile=input_file, outfile="temp_cw_prot.aln"
        )
        self.add_file_to_clean(input_file)
        self.standard_test_procedure(cline)

    def test_input_filename_with_space(self):
        """Test an input filename containing a space."""
        input_file = "Clustalw/temp horses.fasta"
        # Build the input by converting a PHYLIP example file to FASTA.
        with open(input_file, "w") as handle:
            SeqIO.write(SeqIO.parse("Phylip/hennigian.phy", "phylip"), handle, "fasta")
        cline = ClustalwCommandline(
            clustalw_exe, infile=input_file, outfile="temp with space.aln"
        )
        self.add_file_to_clean(input_file)
        self.standard_test_procedure(cline)

    def test_output_filename_with_spaces(self):
        """Test an output filename containing spaces."""
        cline = ClustalwCommandline(
            clustalw_exe, infile="GFF/multi.fna", outfile="temp with space.aln"
        )
        self.standard_test_procedure(cline)
class ClustalWTestVersionTwoSpecific(ClustalWTestCase):
    """Tests specific to ClustalW2."""

    def test_statistics(self):
        """Test a statistics file."""
        if clustalw_exe != "clustalw2":
            # Report a skip to the test runner instead of printing a message
            # and letting the test count as passed.
            self.skipTest("Skipping ClustalW2 specific test.")
        statistics_file = "temp_stats.txt"
        cline = ClustalwCommandline(
            clustalw_exe,
            infile="Fasta/f002",
            outfile="temp_test.aln",
            stats=statistics_file,
        )
        # Register the statistics file before running so tearDown removes it.
        self.add_file_to_clean(statistics_file)
        self.standard_test_procedure(cline)
        self.assertTrue(os.path.isfile(statistics_file))
if __name__ == "__main__":
    # Run this module's tests directly, printing one line per test.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,123 +0,0 @@
# Copyright 2009 by Cymon J. Cox. All rights reserved.
# Revisions 2009 copyright by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Unittests for Bio.Align.Applications interface for DIALIGN2-2."""
import os
import sys
import unittest
import warnings
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Align.Applications import DialignCommandline
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"
# Command name of the DIALIGN2-2 executable; stays None unless the probe
# below recognises the tool's output.
dialign_exe = None
if sys.platform == "win32":
raise MissingExternalDependencyError("DIALIGN2-2 not available on Windows")
else:
from subprocess import getoutput
# Run the tool without arguments and inspect what it prints.
output = getoutput("dialign2-2")
# Ignore shell "not found" / "not recognized" messages, then require the
# tool's own name (compared in lower case) to appear in the output.
if "not found" not in output and "not recognized" not in output:
if "dialign2-2" in output.lower():
dialign_exe = "dialign2-2"
# NOTE(review): indentation is lost in this view, so it cannot be seen
# whether the DIALIGN2_DIR checks below only run once the tool was
# detected - presumably they do; confirm against the original file.
if "DIALIGN2_DIR" not in os.environ:
raise MissingExternalDependencyError(
"Environment variable DIALIGN2_DIR for DIALIGN2-2 missing."
)
if not os.path.isdir(os.environ["DIALIGN2_DIR"]):
raise MissingExternalDependencyError(
"Environment variable DIALIGN2_DIR for DIALIGN2-2 is not a valid directory."
)
# The directory must contain a file named BLOSUM.
if not os.path.isfile(os.path.join(os.environ["DIALIGN2_DIR"], "BLOSUM")):
raise MissingExternalDependencyError(
"Environment variable DIALIGN2_DIR directory missing BLOSUM file."
)
# TODO - check for tp400_dna, tp400_prot and tp400_trans too?
# Raised whenever the probe above did not set dialign_exe.
if not dialign_exe:
raise MissingExternalDependencyError(
"Install DIALIGN2-2 if you want to use the Bio.Align.Applications wrapper."
)
class DialignApplication(unittest.TestCase):
    """Run DIALIGN2-2 on Fasta/f002 through DialignCommandline."""

    def setUp(self):
        """Define the input file and the output files the tests look for."""
        self.infile1 = "Fasta/f002"
        # Standard output file
        self.outfile1 = "Fasta/f002.ali"
        # MSF output
        self.outfile2 = "Fasta/f002.ms"

    def tearDown(self):
        """Delete the output files when they exist."""
        for leftover in (self.outfile1, self.outfile2):
            if os.path.isfile(leftover):
                os.remove(leftover)

    def _run_silently(self, cmdline):
        """Run cmdline and require empty stderr and stdout plus outfile1."""
        stdout, stderr = cmdline()
        self.assertEqual(stderr, "")
        self.assertEqual(stdout, "")
        self.assertTrue(os.path.exists(self.outfile1))

    def test_Dialign_simple(self):
        """Simple round-trip through app with infile."""
        # Test using keyword arguments:
        cmdline = DialignCommandline(dialign_exe, input=self.infile1)
        self.assertEqual(str(cmdline), dialign_exe + " Fasta/f002")
        self._run_silently(cmdline)

    def test_Dialign_simple_with_options(self):
        """Simple round-trip through app with infile and options."""
        cmdline = DialignCommandline(dialign_exe)
        cmdline.set_parameter("input", self.infile1)
        cmdline.set_parameter("-max_link", True)
        cmdline.set_parameter("stars", 4)
        self.assertEqual(str(cmdline), dialign_exe + " -max_link -stars 4 Fasta/f002")
        self._run_silently(cmdline)

    def test_Dialign_simple_with_MSF_output(self):
        """Simple round-trip through app with infile, output MSF."""
        cmdline = DialignCommandline(dialign_exe)
        # Test with properties
        cmdline.input = self.infile1
        cmdline.msf = True
        self.assertEqual(str(cmdline), dialign_exe + " -msf Fasta/f002")
        self._run_silently(cmdline)
        self.assertTrue(os.path.exists(self.outfile2))

    def test_Dialign_complex_command_line(self):
        """Round-trip through app with complex command line."""
        cmdline = DialignCommandline(dialign_exe)
        settings = (
            ("input", self.infile1),
            ("-nt", True),
            ("-thr", 4),
            ("stars", 9),
            ("-ow", True),
            ("mask", True),
            ("-cs", True),
        )
        for option, value in settings:
            cmdline.set_parameter(option, value)
        self.assertEqual(
            str(cmdline), dialign_exe + " -cs -mask -nt -ow -stars 9 -thr 4 Fasta/f002"
        )
        stdout, stderr = cmdline()
        self.assertEqual(stderr, "")
        self.assertTrue(os.path.exists(self.outfile1))
        # Unlike the other tests, this run is expected to print to stdout.
        self.assertTrue(stdout.startswith(" e_len = 633"))
if __name__ == "__main__":
    # Run the tests verbosely when this file is executed as a script.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,968 +0,0 @@
# Copyright 2009 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Runs a few EMBOSS tools to check our wrappers and parsers."""
import os
import subprocess
import sys
import unittest
import warnings
from io import StringIO
from Bio import BiopythonDeprecationWarning
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Application import _escape_filename
from Bio.Emboss.Applications import NeedleCommandline
from Bio.Emboss.Applications import SeqmatchallCommandline
from Bio.Emboss.Applications import SeqretCommandline
from Bio.Emboss.Applications import WaterCommandline
from Bio import AlignIO
from Bio import MissingExternalDependencyError
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Seq import translate
from Bio.SeqRecord import SeqRecord
# ###############################################################
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"

exes_wanted = ["water", "needle", "seqret", "transeq", "seqmatchall", "embossversion"]
exes = {}  # Dictionary mapping from names to exe locations

if "EMBOSS_ROOT" in os.environ:
    # Windows default installation path is C:\mEMBOSS which contains the exes.
    # EMBOSS also sets an environment variable which we will check for.
    path = os.environ["EMBOSS_ROOT"]
    if not os.path.isdir(path):
        raise MissingExternalDependencyError(
            f"$EMBOSS_ROOT={path!r} which does not exist!"
        )
    for name in exes_wanted:
        if os.path.isfile(os.path.join(path, f"{name}.exe")):
            exes[name] = os.path.join(path, f"{name}.exe")
    # Do not leave the loop variables behind in the module namespace.
    del name
    del path

if sys.platform != "win32":
    from subprocess import getoutput

    for name in exes_wanted:
        # This will "just work" if installed on the path as normal on Unix
        output = getoutput(f"{name} -help")
        if not ("not found" in output or "not recognized" in output):
            exes[name] = name
        del output
    del name

# Every wanted tool must have been located by one of the two probes above.
if len(exes) < len(exes_wanted):
    raise MissingExternalDependencyError(
        "Install EMBOSS if you want to use Bio.Emboss."
    )
def get_emboss_version():
    """Return a tuple of three ints, e.g. (6,1,0).

    Runs the ``embossversion`` executable recorded in ``exes`` and scans its
    output line by line. Known banner lines are skipped; the first other
    line decides the outcome:

    - two dots: the line is split on "." and returned as a tuple of ints;
    - three dots: likewise, truncated to the first three numbers;
    - anything else: MissingExternalDependencyError is raised.

    MissingExternalDependencyError is also raised if the scan finishes
    without returning.
    """
    # Windows and Unix versions of EMBOSS seem to differ in
    # which lines go to stdout and stderr - so merge them.
    child = subprocess.Popen(
        _escape_filename(exes["embossversion"]),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        shell=(sys.platform != "win32"),
    )
    stdout, stderr = child.communicate()
    child.stdout.close()  # This is both stdout and stderr
    del child
    assert stderr is None  # Send to stdout instead

    # Banner lines printed around the version number, e.g.
    #   $ embossversion
    #   Report the current EMBOSS version number
    #   6.5.7.0
    # or
    #   $ embossversion
    #   Reports the current EMBOSS version number
    #   6.3.1
    banners = (
        "Report the current EMBOSS version number",
        "Reports the current EMBOSS version number",
    )
    for line in stdout.split("\n"):
        text = line.strip()
        if text in banners or line.startswith(
            "Writes the current EMBOSS version number"
        ):
            continue
        dots = line.count(".")
        if dots == 2:
            return tuple(int(v) for v in text.split("."))
        if dots == 3:
            # e.g. I installed mEMBOSS-6.2.0.1-setup.exe
            # which reports 6.2.0.1 - for this return (6,2,0)
            return tuple(int(v) for v in text.split("."))[:3]
        # Either we can't understand the output, or this is really
        # an error message not caught earlier (e.g. not in English)
        raise MissingExternalDependencyError(
            f"Install EMBOSS if you want to use Bio.Emboss ({line})."
        )
    # In case there was no output at all...
    raise MissingExternalDependencyError("Could not get EMBOSS version")
# To avoid confusing known errors from old versions of EMBOSS with failures
# of the code under test, treat any release below 6.1.0 as a missing
# dependency. emboss_version stays at module level because test_abi compares
# against it to work around bugs in specific releases.
# NOTE(review): the message mentions "patch 3" but only the three-number
# tuple is compared here - confirm whether the patch level matters.
emboss_version = get_emboss_version()
if emboss_version < (6, 1, 0):
raise MissingExternalDependencyError("Test requires EMBOSS 6.1.0 patch 3 or later.")
#################################################################
# Top level function as this makes it easier to use for debugging:
def emboss_piped_SeqIO_convert(records, old_format, new_format):
    """Run seqret, returns records (as a generator).

    The records are written with SeqIO in ``old_format`` to the standard
    input of a seqret child process, and whatever seqret prints on standard
    output is parsed with SeqIO as ``new_format``.

    The child process is managed by a ``with`` block, so its pipes are closed
    and the process is waited for when the generator is exhausted, closed
    early, or interrupted by an exception.
    """
    # Setup, this assumes for all the format names used
    # Biopython and EMBOSS names are consistent!
    cline = SeqretCommandline(
        exes["seqret"],
        sformat=old_format,
        osformat=new_format,
        auto=True,  # no prompting
        filter=True,
    )
    # Run the tool,
    with subprocess.Popen(
        str(cline),
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        shell=(sys.platform != "win32"),
    ) as child:
        SeqIO.write(records, child.stdin, old_format)
        # Closing stdin tells seqret that the input is complete.
        child.stdin.close()
        # Error output is not inspected by this helper.
        child.stderr.close()
        yield from SeqIO.parse(child.stdout, new_format)
# Top level function as this makes it easier to use for debugging:
def emboss_piped_AlignIO_convert(alignments, old_format, new_format):
    """Run seqret, returns the converted alignments (as a list).

    The alignments are written with AlignIO in ``old_format`` to the standard
    input of a seqret child process, and its standard output is parsed with
    AlignIO as ``new_format``.
    """
    # Setup, this assumes for all the format names used
    # Biopython and EMBOSS names are consistent!
    seqret = SeqretCommandline(
        exes["seqret"],
        sformat=old_format,
        osformat=new_format,
        auto=True,  # no prompting
        filter=True,
    )
    # Run the tool,
    process = subprocess.Popen(
        str(seqret),
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        shell=(sys.platform != "win32"),
    )
    with process:
        AlignIO.write(alignments, process.stdin, old_format)
        # Closing stdin tells seqret that the input is complete.
        process.stdin.close()
        # Read everything before the pipes are closed on leaving the block.
        return list(AlignIO.parse(process.stdout, new_format))
class SeqRetTests(unittest.TestCase):
    """Base class providing SeqRecord comparison method."""

    def compare_records(self, old_records, new_records, msg=None):
        """Assert that two collections of records hold matching sequences.

        Both collections must have the same length and are compared pairwise.
        For each pair the identifiers must match loosely, the sequences must
        have equal length and be equal ignoring case, and - when both records
        have features - the number of features must agree.

        ``msg`` is passed on to the assertions to describe the conversion
        being checked.
        """
        self.assertEqual(len(old_records), len(new_records), msg)
        for old, new in zip(old_records, new_records):
            # Note the name matching is a bit fuzzy, e.g. truncation and
            # no spaces in PHYLIP files.
            self.assertTrue(
                (old.id in new.id)
                or (new.id in old.id)
                or (old.id.replace(" ", "_") == new.id.replace(" ", "_"))
                or (old.name == new.name),
                msg,
            )
            self.assertEqual(len(old.seq), len(new.seq), msg)
            if old.seq.upper() != new.seq.upper():
                # Report the mismatch as a test failure with a useful message
                # rather than as a bare, message-less exception.
                if str(old.seq).replace("X", "N") == str(new.seq):
                    self.fail(f"{msg}: X -> N (protein forced into nucleotide?)")
                else:
                    self.assertEqual(old.seq, new.seq, msg)
            if old.features and new.features:
                self.assertEqual(len(old.features), len(new.features), msg)
            # TODO - check annotation
class SeqRetSeqIOTests(SeqRetTests):
    """Check EMBOSS seqret against Bio.SeqIO for converting files."""

    def tearDown(self):
        """Run the module's fallback clean up after every test."""
        clean_up()

    def check_SeqIO_to_EMBOSS(self, in_filename, in_format, skip_formats=()):
        """Check SeqIO writes files seqret can read back.

        The file is parsed once; for each intermediate format not listed in
        ``skip_formats`` the records are piped through seqret (written in
        that format, read back as FASTA) and compared with the originals.
        """
        records = list(SeqIO.parse(in_filename, in_format))
        for temp_format in ("genbank", "embl", "fasta"):
            if temp_format not in skip_formats:
                converted = emboss_piped_SeqIO_convert(records, temp_format, "fasta")
                new_records = list(converted)
                msg = f"converting {in_filename} from {in_format} to {temp_format}"
                self.compare_records(records, new_records, msg)

    def check_EMBOSS_to_SeqIO(self, filename, old_format, skip_formats=()):
        """Check SeqIO can read seqret's conversion output.

        seqret converts the file to each output format not listed in
        ``skip_formats``; its standard output is parsed with SeqIO and
        compared with SeqIO's own reading of the original file.
        """
        # TODO: Why can't we read EMBOSS's swiss output?
        self.assertTrue(os.path.isfile(filename))
        old_records = list(SeqIO.parse(filename, old_format))
        for new_format in ("genbank", "fasta", "pir", "embl", "ig"):
            if new_format in skip_formats:
                continue
            seqret = SeqretCommandline(
                exes["seqret"],
                sequence=filename,
                sformat=old_format,
                osformat=new_format,
                auto=True,  # no prompting
                stdout=True,
            )
            # Run the tool,
            process = subprocess.Popen(
                str(seqret),
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                shell=(sys.platform != "win32"),
            )
            with process:
                process.stdin.close()
                new_records = list(SeqIO.parse(process.stdout, new_format))
            msg = f"converting {filename} from {old_format} to {new_format}"
            self.compare_records(old_records, new_records, msg)

    def check_SeqIO_with_EMBOSS(self, filename, old_format, skip_formats=()):
        """Run both directions of the SeqIO / seqret round trip on one file."""
        # Check EMBOSS can read Bio.SeqIO output...
        self.check_SeqIO_to_EMBOSS(filename, old_format, skip_formats)
        # Check Bio.SeqIO can read EMBOSS seqret output...
        self.check_EMBOSS_to_SeqIO(filename, old_format, skip_formats)

    def test_abi(self):
        """Check SeqIO agrees with EMBOSS' Abi to FASTQ conversion."""
        # This lets us check the id, sequence, and quality scores
        for filename in ("Abi/3730.ab1", "Abi/empty.ab1"):
            old = SeqIO.read(filename, "abi")
            seqret = SeqretCommandline(
                exes["seqret"],
                sequence=filename,
                sformat="abi",
                osformat="fastq-sanger",
                auto=True,  # no prompting
                stdout=True,
            )
            # Run the tool,
            process = subprocess.Popen(
                str(seqret),
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                shell=(sys.platform != "win32"),
            )
            with process:
                process.stdin.close()
                new = SeqIO.read(process.stdout, "fastq-sanger")
            # Avoid bug in EMBOSS 6.4.0 (patch forthcoming): skip the id
            # check when that release reports the placeholder id.
            if not (emboss_version == (6, 4, 0) and new.id == "EMBOSS_001"):
                self.assertEqual(old.id, new.id)
            self.assertEqual(old.seq, new.seq)
            # Apparent bug in EMBOSS 6.2.0.1 on Windows: skip the annotation
            # check when an old release reports every quality as 1.
            all_ones = emboss_version < (6, 3, 0) and new.letter_annotations[
                "phred_quality"
            ] == [1] * len(old)
            if not all_ones:
                self.assertEqual(old.letter_annotations, new.letter_annotations)

    def test_genbank(self):
        """Check SeqIO & EMBOSS reading each other's conversions of a GenBank file."""
        self.check_SeqIO_with_EMBOSS("GenBank/cor6_6.gb", "genbank")

    def test_genbank2(self):
        """Check SeqIO & EMBOSS reading each other's conversions of another GenBank file."""
        self.check_SeqIO_with_EMBOSS("GenBank/NC_000932.gb", "genbank")

    def test_embl(self):
        """Check SeqIO & EMBOSS reading each other's conversions of an EMBL file."""
        self.check_SeqIO_with_EMBOSS("EMBL/U87107.embl", "embl")

    def test_ig(self):
        """Check SeqIO & EMBOSS reading each other's conversions of an ig file."""
        # NOTE - EMBOSS considers "genbank" to be for nucleotides only,
        # and will turn "X" into "N" for GenBank output.
        self.check_SeqIO_to_EMBOSS(
            "IntelliGenetics/VIF_mase-pro.txt", "ig", skip_formats=["genbank", "embl"]
        )
        # TODO - What does a % in an ig sequence mean?
        # e.g. "IntelliGenetics/vpu_nucaligned.txt"
        # and  "IntelliGenetics/TAT_mase_nuc.txt"
        # EMBOSS seems to ignore them.

    def test_pir(self):
        """Check SeqIO & EMBOSS reading each other's conversions of a PIR file."""
        # Skip genbank here, EMBOSS mangles the LOCUS line:
        self.check_SeqIO_with_EMBOSS(
            "NBRF/clustalw.pir", "pir", skip_formats=["genbank"]
        )
        # Skip EMBL here, EMBOSS mangles the ID line
        # Skip GenBank, EMBOSS 6.0.1 on Windows won't output proteins as GenBank
        self.check_SeqIO_with_EMBOSS(
            "NBRF/DMB_prot.pir", "pir", skip_formats=["embl", "genbank"]
        )

    def test_clustalw(self):
        """Check SeqIO & EMBOSS reading each other's conversions of a Clustalw file."""
        for aln_file in ("Clustalw/hedgehog.aln", "Clustalw/opuntia.aln"):
            self.check_SeqIO_with_EMBOSS(
                aln_file, "clustal", skip_formats=["embl", "genbank"]
            )
class SeqRetAlignIOTests(SeqRetTests):
    """Check EMBOSS seqret against Bio.AlignIO for converting files."""

    def tearDown(self):
        """Run the module's fallback clean up after every test."""
        clean_up()

    def compare_alignments(self, old_list, new_list, msg=None):
        """Assert two lists of alignments match, alignment by alignment."""
        self.assertEqual(len(old_list), len(new_list), msg)
        for old_align, new_align in zip(old_list, new_list):
            self.compare_records(old_align, new_align, msg)

    def check_EMBOSS_to_AlignIO(self, filename, old_format, skip_formats=()):
        """Check AlignIO can read seqret's conversion of the file."""
        self.assertTrue(os.path.isfile(filename), filename)
        old_aligns = list(AlignIO.parse(filename, old_format))
        formats = ["clustal", "phylip", "ig", "msf"]
        # FASTA and NEXUS are only tried when the file holds one alignment.
        if len(old_aligns) == 1:
            formats += ["fasta", "nexus"]
        for new_format in formats:
            if new_format in skip_formats:
                continue
            seqret = SeqretCommandline(
                exes["seqret"],
                sequence=filename,
                sformat=old_format,
                osformat=new_format,
                auto=True,  # no prompting
                stdout=True,
            )
            # Run the tool,
            process = subprocess.Popen(
                str(seqret),
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                shell=(sys.platform != "win32"),
            )
            with process:
                process.stdin.close()
                new_aligns = list(AlignIO.parse(process.stdout, new_format))
            msg = f"converting {filename} from {old_format} to {new_format}"
            self.compare_alignments(old_aligns, new_aligns, msg)

    def check_AlignIO_to_EMBOSS(self, in_filename, in_format, skip_formats=()):
        """Check Bio.AlignIO can write files seqret can read."""
        old_aligns = list(AlignIO.parse(in_filename, in_format))
        formats = ["clustal", "phylip"]
        # FASTA and NEXUS are only tried when the file holds one alignment.
        if len(old_aligns) == 1:
            formats += ["fasta", "nexus"]
        # ValueError messages that are accepted as a reason to skip a format.
        tolerated = (
            "Need the molecule type to be defined",
            "Repeated name 'AT3G20900.' (originally 'AT3G20900.1-SEQ'), possibly due to truncation",
            "Repeated name 'gi|1377497' (originally 'gi|13774975|gb|AAK39115.1|AF35'), possibly due to truncation",
            "Repeated name 'gi_1393639' (originally 'gi_13936397_dbj_BAB47195.'), possibly due to truncation",
        )
        for temp_format in formats:
            if temp_format in skip_formats:
                continue
            # PHYLIP is a simple format which explicitly supports
            # multiple alignments (unlike FASTA).
            try:
                new_aligns = list(
                    emboss_piped_AlignIO_convert(old_aligns, temp_format, "phylip")
                )
            except ValueError as e:
                self.assertIn(str(e), tolerated)
            else:
                msg = f"converting {in_filename} from {in_format} to {temp_format}"
                self.compare_alignments(old_aligns, new_aligns, msg)

    def check_AlignIO_with_EMBOSS(self, filename, old_format, skip_formats=()):
        """Run both directions of the AlignIO / seqret round trip on one file."""
        # Check EMBOSS can read Bio.AlignIO output...
        self.check_AlignIO_to_EMBOSS(filename, old_format, skip_formats)
        # Check Bio.AlignIO can read EMBOSS seqret output...
        self.check_EMBOSS_to_AlignIO(filename, old_format, skip_formats)

    def test_align_clustalw(self):
        """Check AlignIO & EMBOSS reading each other's conversions of a ClustalW file."""
        self.check_AlignIO_with_EMBOSS("Clustalw/hedgehog.aln", "clustal")
        self.check_AlignIO_with_EMBOSS("Clustalw/opuntia.aln", "clustal")
        self.check_AlignIO_with_EMBOSS(
            "Clustalw/odd_consensus.aln", "clustal", skip_formats=["nexus"]
        )  # TODO - why not nexus?
        self.check_AlignIO_with_EMBOSS("Clustalw/protein.aln", "clustal")
        self.check_AlignIO_with_EMBOSS("Clustalw/promals3d.aln", "clustal")

    # NOTE(review): despite its name this test exercises PHYLIP files.
    def test_clustalw(self):
        """Check AlignIO & EMBOSS reading each other's conversions of a PHYLIP file."""
        phylip_files = (
            "Phylip/horses.phy",
            "Phylip/hennigian.phy",
            "Phylip/reference_dna.phy",
            "Phylip/reference_dna2.phy",
            "Phylip/interlaced.phy",
            "Phylip/interlaced2.phy",
            "Phylip/random.phy",
        )
        for phylip_file in phylip_files:
            self.check_AlignIO_with_EMBOSS(phylip_file, "phylip")
class PairwiseAlignmentTests(unittest.TestCase):
    """Run pairwise alignments with water and needle, and parse them."""

    def tearDown(self):
        """Run the module's fallback clean up after every test."""
        clean_up()

    @staticmethod
    def _spawn(cline):
        """Start ``cline`` as a child process with all three streams piped.

        Callers use the returned Popen object as a context manager so that
        the pipes are closed and the child is waited for even when an
        assertion fails part-way through a test.
        """
        return subprocess.Popen(
            str(cline),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            shell=(sys.platform != "win32"),
        )

    def pairwise_alignment_check(self, query_seq, targets, alignments, local=True):
        """Check pairwise alignment data is sane.

        ``targets`` and ``alignments`` are iterated in step and must have the
        same length. Every alignment must have two rows, the second row's id
        must occur in the target's id or name, and the gap-stripped rows must
        be contained in (``local=True``) or equal to (``local=False``) the
        query and target sequences. Returns True when every check passes.
        """
        # The datasets should be small, so making iterators into lists is OK
        targets = list(targets)
        alignments = list(alignments)
        self.assertEqual(len(targets), len(alignments))
        for target, alignment in zip(targets, alignments):
            self.assertEqual(len(alignment), 2)
            # self.assertEqual(target.id, alignment[1].id) #too strict
            msg = f"{alignment[1].id} vs {target.id} or {target.name}"
            self.assertTrue(
                alignment[1].id in target.id or alignment[1].id in target.name, msg=msg
            )
            if local:
                # Local alignment
                self.assertIn(str(alignment[0].seq).replace("-", ""), query_seq)
                self.assertIn(
                    str(alignment[1].seq).replace("-", "").upper(), target.seq.upper()
                )
            else:
                # Global alignment
                self.assertEqual(query_seq, str(alignment[0].seq).replace("-", ""))
                self.assertEqual(
                    target.seq.upper(), str(alignment[1].seq).replace("-", "").upper()
                )
        return True

    def run_water(self, cline):
        """Run a water command line and check its stdout/stderr.

        When the command line has an output file, stdout must be empty and
        the file must exist; otherwise stdout is returned.
        """
        # Run the tool,
        stdout, stderr = cline()
        self.assertTrue(
            stderr.strip().startswith("Smith-Waterman local alignment"), stderr
        )
        if cline.outfile:
            self.assertEqual(stdout.strip(), "")
            self.assertTrue(
                os.path.isfile(cline.outfile),
                f"Missing output file {cline.outfile!r} from:\n{cline}",
            )
        else:
            # Don't use this yet... could return stdout handle instead?
            return stdout

    def test_water_file(self):
        """Run water with the asis trick, output to a file."""
        # Setup, try a mixture of keyword arguments and later additions:
        cline = WaterCommandline(cmd=exes["water"], gapopen="10", gapextend="0.5")
        # Try using both human readable names, and the literal ones:
        cline.set_parameter("asequence", "asis:ACCCGGGCGCGGT")
        cline.set_parameter("-bsequence", "asis:ACCCGAGCGCGGT")
        # Try using a property set here:
        cline.outfile = "Emboss/temp with space.water"
        # The repr of the wrapper must evaluate back to an equivalent object.
        self.assertEqual(str(eval(repr(cline))), str(cline))
        # Run the tool,
        self.run_water(cline)
        # Check we can parse the output...
        align = AlignIO.read(cline.outfile, "emboss")
        self.assertEqual(len(align), 2)
        self.assertEqual(align[0].seq, "ACCCGGGCGCGGT")
        self.assertEqual(align[1].seq, "ACCCGAGCGCGGT")
        # Clean up,
        os.remove(cline.outfile)

    def test_water_piped(self):
        """Run water with asis trick, output piped to stdout."""
        cline = WaterCommandline(
            cmd=exes["water"],
            asequence="asis:ACCCGGGCGCGGT",
            bsequence="asis:ACCCGAGCGCGGT",
            gapopen=10,
            gapextend=0.5,
            auto=True,
            filter=True,
        )
        self.assertEqual(
            str(cline),
            exes["water"]
            + " -auto -filter"
            + " -asequence=asis:ACCCGGGCGCGGT"
            + " -bsequence=asis:ACCCGAGCGCGGT"
            + " -gapopen=10 -gapextend=0.5",
        )
        # Run the tool,
        with self._spawn(cline) as child:
            child.stdin.close()
            # Check we could read its output
            align = AlignIO.read(child.stdout, "emboss")
            self.assertEqual(len(align), 2)
            self.assertEqual(align[0].seq, "ACCCGGGCGCGGT")
            self.assertEqual(align[1].seq, "ACCCGAGCGCGGT")
            # Check no error output:
            self.assertEqual(child.stderr.read(), "")
            self.assertEqual(0, child.wait())

    def test_needle_file(self):
        """Run needle with the asis trick, output to a file."""
        # Setup,
        cline = NeedleCommandline(cmd=exes["needle"])
        cline.set_parameter("-asequence", "asis:ACCCGGGCGCGGT")
        cline.set_parameter("-bsequence", "asis:ACCCGAGCGCGGT")
        cline.set_parameter("-gapopen", "10")
        cline.set_parameter("-gapextend", "0.5")
        # EMBOSS would guess this, but let's be explicit:
        cline.set_parameter("-snucleotide", "True")
        cline.set_parameter("-outfile", "Emboss/temp with space.needle")
        # The repr of the wrapper must evaluate back to an equivalent object.
        self.assertEqual(str(eval(repr(cline))), str(cline))
        # Run the tool,
        stdout, stderr = cline()
        # Check it worked,
        self.assertTrue(
            stderr.strip().startswith("Needleman-Wunsch global alignment"), stderr
        )
        self.assertEqual(stdout.strip(), "")
        filename = cline.outfile
        self.assertTrue(
            os.path.isfile(filename),
            f"Missing output file {filename!r} from:\n{cline}",
        )
        # Check we can parse the output...
        align = AlignIO.read(filename, "emboss")
        self.assertEqual(len(align), 2)
        self.assertEqual(align[0].seq, "ACCCGGGCGCGGT")
        self.assertEqual(align[1].seq, "ACCCGAGCGCGGT")
        # Clean up,
        os.remove(filename)

    def test_needle_piped(self):
        """Run needle with asis trick, output piped to stdout."""
        cline = NeedleCommandline(
            cmd=exes["needle"],
            asequence="asis:ACCCGGGCGCGGT",
            bsequence="asis:ACCCGAGCGCGGT",
            gapopen=10,
            gapextend=0.5,
            auto=True,
            filter=True,
        )
        self.assertEqual(
            str(cline),
            exes["needle"]
            + " -auto -filter"
            + " -asequence=asis:ACCCGGGCGCGGT"
            + " -bsequence=asis:ACCCGAGCGCGGT"
            + " -gapopen=10 -gapextend=0.5",
        )
        # Run the tool,
        with self._spawn(cline) as child:
            child.stdin.close()
            # Check we could read its output
            align = AlignIO.read(child.stdout, "emboss")
            self.assertEqual(len(align), 2)
            self.assertEqual(align[0].seq, "ACCCGGGCGCGGT")
            self.assertEqual(align[1].seq, "ACCCGAGCGCGGT")
            # Check no error output:
            self.assertEqual(child.stderr.read(), "")
            self.assertEqual(0, child.wait())

    def test_water_file2(self):
        """Run water with the asis trick and nucleotide FASTA file, output to a file."""
        # Setup,
        query = "ACACACTCACACACACTTGGTCAGAGATGCTGTGCTTCTTGGAAGCAAGGNCTCAAAGGCAAGGTGCACGCAGAGGGACGTTTGAGTCTGGGATGAAGCATGTNCGTATTATTTATATGATGGAATTTCACGTTTTTATG"
        out_file = "Emboss/temp_test2.water"
        in_file = "Fasta/f002"
        self.assertTrue(os.path.isfile(in_file))
        if os.path.isfile(out_file):
            os.remove(out_file)
        cline = WaterCommandline(cmd=exes["water"])
        cline.set_parameter("-asequence", f"asis:{query}")
        cline.set_parameter("-bsequence", in_file)
        cline.set_parameter("-gapopen", "10")
        cline.set_parameter("-gapextend", "0.5")
        cline.set_parameter("-outfile", out_file)
        self.assertEqual(str(eval(repr(cline))), str(cline))
        # Run the tool,
        self.run_water(cline)
        # Check we can parse the output and it is sensible...
        self.pairwise_alignment_check(
            query,
            SeqIO.parse(in_file, "fasta"),
            AlignIO.parse(out_file, "emboss"),
            local=True,
        )
        # Clean up,
        os.remove(out_file)

    def test_water_file3(self):
        """Run water with the asis trick and GenBank file, output to a file."""
        # Setup,
        query = "TGTTGTAATGTTTTAATGTTTCTTCTCCCTTTAGATGTACTACGTTTGGA"
        out_file = "Emboss/temp_test3.water"
        in_file = "GenBank/cor6_6.gb"
        self.assertTrue(os.path.isfile(in_file))
        if os.path.isfile(out_file):
            os.remove(out_file)
        cline = WaterCommandline(cmd=exes["water"])
        cline.set_parameter("asequence", f"asis:{query}")
        cline.set_parameter("bsequence", in_file)
        # TODO - Tell water this is a GenBank file!
        cline.set_parameter("gapopen", "1")
        cline.set_parameter("gapextend", "0.5")
        cline.set_parameter("outfile", out_file)
        self.assertEqual(str(eval(repr(cline))), str(cline))
        # Run the tool,
        self.run_water(cline)
        # Check we can parse the output and it is sensible...
        self.pairwise_alignment_check(
            query,
            SeqIO.parse(in_file, "genbank"),
            AlignIO.parse(out_file, "emboss"),
            local=True,
        )
        # Clean up,
        os.remove(out_file)

    def test_water_file4(self):
        """Run water with the asis trick and SwissProt file, output to a file."""
        # Setup,
        query = "DVCTGKALCDPVTQNIKTYPVKIENLRVMI"
        out_file = "Emboss/temp_test4.water"
        in_file = "SwissProt/P0A186.txt"
        self.assertTrue(os.path.isfile(in_file))
        if os.path.isfile(out_file):
            os.remove(out_file)
        cline = WaterCommandline(cmd=exes["water"])
        cline.set_parameter("-asequence", f"asis:{query}")
        cline.set_parameter("-bsequence", in_file)
        # EMBOSS should work this out, but let's be explicit:
        cline.set_parameter("-sprotein", True)
        # TODO - Tell water this is a SwissProt file!
        cline.set_parameter("-gapopen", "20")
        cline.set_parameter("-gapextend", "5")
        cline.set_parameter("-outfile", out_file)
        self.assertEqual(str(eval(repr(cline))), str(cline))
        # Run the tool,
        self.run_water(cline)
        # Check we can parse the output and it is sensible...
        self.pairwise_alignment_check(
            query,
            SeqIO.parse(in_file, "swiss"),
            AlignIO.parse(out_file, "emboss"),
            local=True,
        )
        # Clean up,
        os.remove(out_file)

    def test_needle_piped2(self):
        """Run needle with asis trick, and nucleotide FASTA file, output piped to stdout."""
        # TODO - Support needle in Bio.Emboss.Applications
        # (ideally with the -auto and -filter arguments)
        # Setup,
        query = "ACACACTCACACACACTTGGTCAGAGATGCTGTGCTTCTTGGAA"
        cline = exes["needle"]
        cline += " -asequence asis:" + query
        cline += " -bsequence Fasta/f002"
        cline += " -auto"  # no prompting
        cline += " -filter"  # use stdout
        # Run the tool,
        with self._spawn(cline) as child:
            child.stdin.close()
            # Check we can parse the output and it is sensible...
            self.pairwise_alignment_check(
                query,
                SeqIO.parse("Fasta/f002", "fasta"),
                AlignIO.parse(child.stdout, "emboss"),
                local=False,
            )
            # Check no error output:
            self.assertEqual(child.stderr.read(), "")
            self.assertEqual(0, child.wait())

    def test_water_needs_output(self):
        """Run water without output file or stdout/filter should give error."""
        cline = WaterCommandline(
            cmd=exes["water"],
            asequence="asis:ACCCGGGCGCGGT",
            bsequence="asis:ACCCGAGCGCGGT",
            gapopen=10,
            gapextend=0.5,
            auto=True,
        )
        self.assertTrue(cline.auto)
        self.assertFalse(cline.stdout)
        self.assertFalse(cline.filter)
        self.assertIsNone(cline.outfile)
        # Rendering the command line is what triggers the validation error.
        self.assertRaises(ValueError, str, cline)

    def test_needle_needs_output(self):
        """Run needle without output file or stdout/filter should give error."""
        cline = NeedleCommandline(
            cmd=exes["needle"],
            asequence="asis:ACCCGGGCGCGGT",
            bsequence="asis:ACCCGAGCGCGGT",
            gapopen=10,
            gapextend=0.5,
            auto=True,
        )
        self.assertTrue(cline.auto)
        self.assertFalse(cline.stdout)
        self.assertFalse(cline.filter)
        self.assertIsNone(cline.outfile)
        # Rendering the command line is what triggers the validation error.
        self.assertRaises(ValueError, str, cline)

    def test_seqtmatchall_piped(self):
        """Run seqmatchall with pair output piped to stdout."""
        cline = SeqmatchallCommandline(
            cmd=exes["seqmatchall"],
            sequence="Fasta/f002",
            aformat="pair",
            wordsize=9,
            auto=True,
            stdout=True,
        )
        self.assertEqual(
            str(cline),
            exes["seqmatchall"]
            + " -auto -stdout"
            + " -sequence=Fasta/f002"
            + " -wordsize=9 -aformat=pair",
        )
        # Run the tool,
        with self._spawn(cline) as child:
            child.stdin.close()
            # Check we could read its output
            for align in AlignIO.parse(child.stdout, "emboss"):
                self.assertEqual(len(align), 2)
                self.assertEqual(align.get_alignment_length(), 9)
            # Check no error output:
            self.assertEqual(child.stderr.read(), "")
            self.assertEqual(0, child.wait())
class TranslationTests(unittest.TestCase):
    """Compare translations by EMBOSS transeq with those of Bio.Seq."""

    def tearDown(self):
        """Run the module's fallback clean up after every test."""
        clean_up()

    def test_simple(self):
        """Run transeq vs Bio.Seq for simple translations (including alt tables)."""
        examples = [
            Seq("ACGTGACTGACGTAGCATGCCACTAGG"),
            # Unambiguous TA? codons:
            Seq("TAATACTATTAG"),
            # Most of the ambiguous TA? codons:
            Seq("TANTARTAYTAMTAKTAHTABTADTAV"),
            # Problem cases,
            #
            # Seq("TAW"),
            # W = A or T, but EMBOSS does TAW -> X
            # TAA -> Y, TAT ->Y, so in Biopython TAW -> Y
            #
            # Seq("TAS"),
            # S = C or G, but EMBOSS does TAS -> Y
            # TAG -> *, TAC ->Y, so in Biopython TAS -> X (Y or *)
            #
            # Seq("AAS"),
            # On table 9, EMBOSS gives N, we give X.
            # S = C or G, so according to my reading of
            # table 9 on the NCBI page, AAC=N, AAG=K
            # suggesting this is a bug in EMBOSS.
            #
            Seq("ACGGGGGGGGTAAGTGGTGTGTGTGTAGT"),
        ]
        for sequence in examples:
            # EMBOSS treats spare residues differently... avoid this issue
            if len(sequence) % 3 != 0:
                sequence = sequence[: -(len(sequence) % 3)]
            self.assertEqual(len(sequence) % 3, 0)
            self.assertGreater(len(sequence), 0)
            self.check(sequence)

    def check_emboss_translate(self, sequence, table=None, frame=None):
        """Run transeq on the sequence and compare with Bio.Seq's translation.

        Sequences shorter than 100 letters are passed on the command line
        with the "asis:" prefix; longer ones go through a temporary FASTA
        file. ``table`` and ``frame`` are forwarded to transeq when given;
        ``table`` defaults to 1 for the Bio.Seq side of the comparison.
        """
        # TODO - Support transeq in Bio.Emboss.Applications?
        # (doesn't seem worthwhile as Biopython can do translations)
        # Setup,
        cline = exes["transeq"]
        if len(sequence) < 100:
            filename = None
            cline += f" -sequence asis:{sequence}"
        else:
            # There are limits on command line string lengths...
            # use a temp file instead.
            filename = "Emboss/temp_transeq.txt"
            SeqIO.write(SeqRecord(sequence, id="Test"), filename, "fasta")
            cline += f" -sequence {filename}"
        cline += " -auto"  # no prompting
        cline += " -filter"  # use stdout
        if table is not None:
            cline += f" -table {table!s}"
        if frame is not None:
            cline += f" -frame {frame!s}"
        # Run the tool,
        child = subprocess.Popen(
            str(cline),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            shell=(sys.platform != "win32"),
        )
        out, err = child.communicate()
        msg = f"cline='{cline}'"
        # Check no error output:
        self.assertEqual(err, "", msg=msg)
        # Check we could read its output
        record = SeqIO.read(StringIO(out), "fasta")
        result = child.wait()
        self.assertEqual(result, 0, msg=msg)
        if filename:
            os.remove(filename)
            self.assertTrue(record.id.startswith("Test"), msg=msg)
        else:
            self.assertTrue(record.id.startswith("asis"), msg=msg)
        translation = record.seq
        if table is None:
            table = 1
        # The three Bio.Seq entry points must all agree with transeq.
        self.assertEqual(translation, sequence.translate(table))
        self.assertEqual(translation, translate(sequence, table))
        self.assertEqual(translation, translate(str(sequence), table))
        # More details...
        for i, amino in enumerate(translation):
            codon = sequence[i * 3 : i * 3 + 3]
            msg = f"codon {codon}, table {table}"
            self.assertEqual(amino, codon.translate(table), msg=msg)

    def check(self, sequence):
        """Compare our translation to EMBOSS's using all tables.

        Takes a Seq object (and a filename containing it).
        """
        self.check_emboss_translate(sequence)
        for table in [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23]:
            self.check_emboss_translate(sequence, table)

    def translate_all_codons(self, letters):
        """Check one sequence made of every three-letter codon over ``letters``."""
        # Each of the three codon positions ranges over all of the letters.
        sequence = Seq(
            "".join(c1 + c2 + c3 for c1 in letters for c2 in letters for c3 in letters)
        )
        self.check(sequence)

    # def test_all_ambig_dna_codons(self):
    #     """transeq vs Bio.Seq on ambiguous DNA codons (inc. alt tables)."""
    #     self.translate_all_codons(ambiguous_dna_letters)

    def test_all_unambig_dna_codons(self):
        """Run transeq vs Bio.Seq on unambiguous DNA codons (inc. alt tables)."""
        self.translate_all_codons("ATCGatcg")

    def test_all_unambig_rna_codons(self):
        """Run transeq vs Bio.Seq on unambiguous RNA codons (inc. alt tables)."""
        self.translate_all_codons("AUCGaucg")

    def test_mixed_unambig_rna_codons(self):
        """Run transeq vs Bio.Seq on unambiguous DNA/RNA codons (inc. alt tables)."""
        self.translate_all_codons("ATUCGatucg")
def clean_up():
    """Fallback clean up method to remove temp files.

    Deletes every entry of the "Emboss" directory whose name starts with
    "temp_". Entries that cannot be removed are skipped, as this is only a
    best-effort clean up.
    """
    for filename in os.listdir("Emboss"):
        if filename.startswith("temp_"):
            try:
                # os.listdir() returns bare names, so the directory has to be
                # put back to address the file itself.
                os.remove(os.path.join("Emboss", filename))
            except OSError:
                pass
if __name__ == "__main__":
    runner = unittest.TextTestRunner(verbosity=2)
    try:
        # unittest.main() ends by raising SystemExit, so the clean up has to
        # sit in a finally clause to run at all; the exit status is kept.
        unittest.main(testRunner=runner)
    finally:
        clean_up()

View File

@ -1,360 +0,0 @@
# Copyright 2009 by David Winter. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Tests for EmbossPhylipNew module."""
import os
import sys
import unittest
import warnings
from Bio import AlignIO
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
from Bio.Nexus import Trees # One day we should use planned TreeIO module
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Emboss.Applications import FConsenseCommandline
from Bio.Emboss.Applications import FDNADistCommandline
from Bio.Emboss.Applications import FDNAParsCommandline
from Bio.Emboss.Applications import FNeighborCommandline
from Bio.Emboss.Applications import FProtDistCommandline
from Bio.Emboss.Applications import FProtParsCommandline
from Bio.Emboss.Applications import FSeqBootCommandline
from Bio.Emboss.Applications import FTreeDistCommandline
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"

# Names of the PhylipNew executables that must all be located; if any is
# missing, MissingExternalDependencyError is raised further down.
exes_wanted = [
    "fdnadist",
    "fneighbor",
    "fprotdist",
    "fprotpars",
    "fconsense",
    "fseqboot",
    "ftreedist",
    "fdnapars",
]
exes = {}  # Dictionary mapping from names to exe locations

if "EMBOSS_ROOT" in os.environ:
    # Windows default installation path is C:\mEMBOSS which contains the exes.
    # EMBOSS also sets an environment variable which we will check for.
    path = os.environ["EMBOSS_ROOT"]
    if os.path.isdir(path):
        for name in exes_wanted:
            if os.path.isfile(os.path.join(path, name + ".exe")):
                exes[name] = os.path.join(path, name + ".exe")
        # Drop the scratch names from the module namespace.
        del path, name
if sys.platform != "win32":
    from subprocess import getoutput

    for name in exes_wanted:
        # This will "just work" if installed on the path as normal on Unix
        output = getoutput(f"{name} -help")
        # A tool counts as present unless the output contains one of these
        # "command missing" phrases.
        if "not found" not in output and "not recognized" not in output:
            exes[name] = name
        del output
    del name

# Every wanted executable must have been found by one of the probes above.
if len(exes) < len(exes_wanted):
    raise MissingExternalDependencyError(
        "Install the Emboss package 'PhylipNew' if you want to use the "
        "Bio.Emboss.Applications wrappers for phylogenetic tools."
    )
# #########################################################################
# A few top level functions that are called repeatedly in the test cases
def clean_up():
    """Delete tests files (to be used as tearDown() function in test fixtures)."""
    scratch_files = ("test_file", "Phylip/opuntia.phy", "Phylip/hedgehog.phy")
    # Only paths that currently exist as regular files are removed.
    for path in filter(os.path.isfile, scratch_files):
        os.remove(path)
def parse_trees(filename):
    """Parse trees.

    Helper function until we have Bio.Phylo on trunk.

    Reads ``filename``, splits its text on a semicolon followed by a
    newline, and yields one ``Trees.Tree`` for each non-empty piece.
    """
    # TODO - Can this be removed now?
    # Read the file the caller asked for rather than a fixed path.
    with open(filename) as handle:
        data = handle.read()
    for tree_str in data.split(";\n"):
        if tree_str:
            # Put back the terminator that split() removed.
            yield Trees.Tree(tree_str + ";")
class DistanceTests(unittest.TestCase):
"""Tests for calculating distance based phylogenetic trees with phylip."""
def tearDown(self):
clean_up()
test_taxa = [
"Archaeohip",
"Calippus",
"Hypohippus",
"M._secundu",
"Merychippu",
"Mesohippus",
"Nannipus",
"Neohippari",
"Parahippus",
"Pliohippus",
]
def distances_from_alignment(self, filename, DNA=True):
"""Check we can make a distance matrix from a given alignment."""
self.assertTrue(os.path.isfile(filename), f"Missing {filename}")
if DNA:
cline = FDNADistCommandline(
exes["fdnadist"],
method="j",
sequence=filename,
outfile="test_file",
auto=True,
)
else:
cline = FProtDistCommandline(
exes["fprotdist"],
method="j",
sequence=filename,
outfile="test_file",
auto=True,
)
stdout, strerr = cline()
# biopython can't grok distance matrices, so we'll just check it exists
self.assertTrue(os.path.isfile("test_file"))
def tree_from_distances(self, filename):
"""Check we can estimate a tree from a distance matrix."""
self.assertTrue(os.path.isfile(filename), f"Missing {filename}")
cline = FNeighborCommandline(
exes["fneighbor"],
datafile=filename,
outtreefile="test_file",
auto=True,
filter=True,
)
stdout, stderr = cline()
for tree in parse_trees("test_file"):
tree_taxa = [t.replace(" ", "_") for t in tree.get_taxa()]
self.assertEqual(self.test_taxa, sorted(tree_taxa))
def test_distances_from_phylip_DNA(self):
"""Calculate a distance matrix from an phylip alignment."""
self.distances_from_alignment("Phylip/horses.phy")
def test_distances_from_AlignIO_DNA(self):
"""Calculate a distance matrix from an alignment written by AlignIO."""
n = AlignIO.convert(
"Clustalw/opuntia.aln", "clustal", "Phylip/opuntia.phy", "phylip"
)
self.assertEqual(n, 1)
self.distances_from_alignment("Phylip/opuntia.phy")
# def test_distances_from_bootstrapped_phylip_DNA(self):
# """Calculate a set of distance matrices from phylip alignments"""
# self.distances_from_alignment("Phylip/bs_horses.phy")
# fprotdist tests
def test_distances_from_protein_phylip(self):
"""Calculate a distance matrix from phylip protein alignment."""
self.distances_from_alignment("Phylip/interlaced.phy", DNA=False)
def test_distances_from_protein_AlignIO(self):
"""Calculate distance matrix from an AlignIO written protein alignment."""
n = AlignIO.convert(
"Clustalw/hedgehog.aln", "clustal", "Phylip/hedgehog.phy", "phylip"
)
self.assertEqual(n, 1)
self.distances_from_alignment("Phylip/hedgehog.phy", DNA=False)
# def test_distances_from_bootstrapped_phylip_protein(self):
# """Calculate distance matrices from a bootstrapped protein alignment"""
# self.distances_from_alignment("Clustalw/bs_interlaced.phy", DNA=False)
# fneighbor tests
# def test_tree_from_distances(self):
# """Estimate tree from distance matrix and parse it."""
# self.tree_from_distances("Phylip/horses.fdnadist")
# This one won't work because of a bug in EMBOSS 6.0.1
# def test_tree_from_bootstrapped_distances(self):
# """Estimate tree from bootstrapped distance matrix and parse it"""
# self.tree_from_distances("Phylip/bs_horses.fdnadist")
class ParsimonyTests(unittest.TestCase):
    """Tests for estimating parsimony based phylogenetic trees with phylip."""

    def tearDown(self):
        clean_up()

    def parsimony_tree(self, filename, format, DNA=True):
        """Estimate a parsimony tree from an alignment.

        Runs fdnapars (``DNA`` true) or fprotpars on ``filename`` and checks
        that every tree parsed from ``test_file`` has the same taxa as the
        first alignment read from ``filename`` in the given ``format``.
        """
        self.assertTrue(os.path.isfile(filename), f"Missing {filename}")
        if DNA:
            cline = FDNAParsCommandline(
                exes["fdnapars"],
                sequence=filename,
                outtreefile="test_file",
                auto=True,
                stdout=True,
            )
        else:
            cline = FProtParsCommandline(
                exes["fprotpars"],
                sequence=filename,
                outtreefile="test_file",
                auto=True,
                stdout=True,
            )
        stdout, stderr = cline()
        with open(filename) as handle:
            a_taxa = [
                s.name.replace(" ", "_") for s in next(AlignIO.parse(handle, format))
            ]
        for tree in parse_trees("test_file"):
            t_taxa = [t.replace(" ", "_") for t in tree.get_taxa()]
            self.assertEqual(sorted(a_taxa), sorted(t_taxa))

    # fdnapars tests
    # def test_parsimony_tree_from_phylip_DNA(self):
    #     """Make a parsimony tree from a phylip DNA alignment"""
    #     self.parsimony_tree("Phylip/horses.phy", "phylip")

    def test_parsimony_tree_from_AlignIO_DNA(self):
        """Make a parsimony tree from an alignment written with AlignIO."""
        n = AlignIO.convert(
            "Clustalw/opuntia.aln", "clustal", "Phylip/opuntia.phy", "phylip"
        )
        self.assertEqual(n, 1)
        self.parsimony_tree("Phylip/opuntia.phy", "phylip")

    # def test_parsimony_bootstrapped_phylip_DNA(self):
    #     """Make a parsimony tree from a bootstrapped phylip DNA alignment"""
    #     self.parsimony_tree("Phylip/bs_horses.phy", "phylip")

    # fprotpars tests
    # def test_parsimony_tree_from_phylip_protein(self):
    #     """Make a parsimony tree from a phylip DNA alignment"""
    #     self.parsimony_tree("Phylip/interlaced.phy", "phylip", DNA=False)

    def test_parsimony_from_AlignIO_protein(self):
        """Make a parsimony tree from protein alignment written with AlignIO."""
        n = AlignIO.convert(
            "Clustalw/hedgehog.aln", "clustal", "Phylip/hedgehog.phy", "phylip"
        )
        # Same conversion check as the DNA variant of this test.
        self.assertEqual(n, 1)
        # Analyse the file AlignIO has just written, not an unrelated one.
        self.parsimony_tree("Phylip/hedgehog.phy", "phylip", DNA=False)

    # def test_parsimony_tree_bootstrapped_phylip_protein(self):
    #     """Make a parsimony tree from a phylip DNA alignment"""
    #     self.parsimony_tree("Phylip/bs_interlaced.phy", "phylip", DNA=False)
class BootstrapTests(unittest.TestCase):
    """Tests for pseudosampling alignments with fseqboot."""

    def tearDown(self):
        clean_up()

    def check_bootstrap(self, filename, format, align_type="d"):
        """Check we can use fseqboot to pseudosample an alignment.

        The align_type type argument is passed to the commandline object to
        set the output format to use (from [D]na,[p]rotein and [r]na )
        """
        self.assertTrue(os.path.isfile(filename), f"Missing {filename}")
        cline = FSeqBootCommandline(
            exes["fseqboot"],
            sequence=filename,
            outfile="test_file",
            seqtype=align_type,
            reps=2,
            auto=True,
            filter=True,
        )
        stdout, stderr = cline()
        # the resultant file should have 2 alignments...
        with open("test_file") as handle:
            bs = list(AlignIO.parse(handle, format))
        self.assertEqual(len(bs), 2)
        # ..and each name in the original alignment...
        with open(filename) as handle:
            a_names = [s.name.replace(" ", "_") for s in AlignIO.read(handle, format)]
        # ...should be in each alignment in the bootstrapped file
        for a in bs:
            self.assertEqual(a_names, [s.name.replace(" ", "_") for s in a])

    def test_bootstrap_phylip_DNA(self):
        """Pseudosample a phylip DNA alignment."""
        self.check_bootstrap("Phylip/horses.phy", "phylip")

    def test_bootstrap_AlignIO_DNA(self):
        """Pseudosample a phylip DNA alignment written with AlignIO."""
        n = AlignIO.convert(
            "Clustalw/opuntia.aln", "clustal", "Phylip/opuntia.phy", "phylip"
        )
        self.assertEqual(n, 1)
        self.check_bootstrap("Phylip/opuntia.phy", "phylip")

    def test_bootstrap_phylip_protein(self):
        """Pseudosample a phylip protein alignment."""
        self.check_bootstrap("Phylip/interlaced.phy", "phylip", "p")

    def test_bootstrap_AlignIO_protein(self):
        """Pseudosample a phylip protein alignment written with AlignIO."""
        n = AlignIO.convert(
            "Clustalw/hedgehog.aln", "clustal", "Phylip/hedgehog.phy", "phylip"
        )
        # Same conversion check as the DNA variant of this test.
        self.assertEqual(n, 1)
        self.check_bootstrap("Phylip/hedgehog.phy", "phylip", "p")
class TreeComparisonTests(unittest.TestCase):
    """Tests for comparing phylogenetic trees with phylip tools."""

    def tearDown(self):
        clean_up()

    def test_fconsense(self):
        """Calculate a consensus tree with fconsense."""
        consense = FConsenseCommandline(
            exes["fconsense"],
            intreefile="Phylip/horses.tree",
            outtreefile="test_file",
            auto=True,
            filter=True,
        )
        _stdout, _stderr = consense()
        # Split the next and get_taxa into two steps to help 2to3 work
        first_tree = next(parse_trees("test_file"))
        expected = sorted(first_tree.get_taxa())
        # Each tree from the second parse must carry the same set of taxa.
        for tree in parse_trees("Phylip/horses.tree"):
            self.assertEqual(expected, sorted(tree.get_taxa()))

    def test_ftreedist(self):
        """Calculate the distance between trees with ftreedist."""
        treedist = FTreeDistCommandline(
            exes["ftreedist"],
            intreefile="Phylip/horses.tree",
            outfile="test_file",
            auto=True,
            filter=True,
        )
        _stdout, _stderr = treedist()
        # Only the existence of the output file is checked.
        self.assertTrue(os.path.isfile("test_file"))
if __name__ == "__main__":
    runner = unittest.TextTestRunner(verbosity=2)
    try:
        unittest.main(testRunner=runner)
    finally:
        # unittest.main() finishes by raising SystemExit, so the final
        # clean up only runs if it is placed in a finally clause.
        clean_up()

View File

@ -1,169 +0,0 @@
# Copyright 2013 by Nate Sutton. All rights reserved.
# Based on test_Clustalw_tool.py by Peter Cock.
# Example code used from Biopython's Phylo cookbook by Eric Talevich.
#
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Tests for Fasttree tool."""
import itertools
import os
import sys
import unittest
import warnings
from io import StringIO
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
from Bio import Phylo
from Bio import SeqIO
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Application import ApplicationError
from Bio.Phylo.Applications import _Fasttree
from Bio.Phylo.Applications import FastTreeCommandline
#################################################################
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"

# Command or full path used to run FastTree; stays None if it is not found.
fasttree_exe = None
if sys.platform == "win32":
    try:
        # This can vary depending on the Windows language.
        prog_files = os.environ["PROGRAMFILES"]
    except KeyError:
        prog_files = r"C:\Program Files (x86)"

    # A default fasttree file path of "C:\Program Files (x86)\Fasttree.exe"
    # was chosen here but users can alter the path according to where
    # fasttree is located on their systems
    likely_dirs = ["", "FastTree"]
    likely_exes = ["FastTree.exe"]
    for folder in likely_dirs:
        if os.path.isdir(os.path.join(prog_files, folder)):
            for filename in likely_exes:
                if os.path.isfile(os.path.join(prog_files, folder, filename)):
                    fasttree_exe = os.path.join(prog_files, folder, filename)
                    break
            # The inner break only leaves the filename loop, so stop the
            # folder loop here as well once an executable was found.
            if fasttree_exe:
                break
else:
    from subprocess import getoutput

    # Website uses 'FastTree', Nate's system had 'fasttree'
    likely_exes = ["FastTree", "fasttree"]
    for filename in likely_exes:
        # Checking the -help argument
        output = getoutput(f"{filename} -help")
        # Since "is not recognized" may be in another language, try and be sure this
        # is really the fasttree tool's output
        if (
            "is not recognized" not in output
            and "protein_alignment" in output
            and "nucleotide_alignment" in output
        ):
            fasttree_exe = filename
            break

# Without an executable none of the tests below can run.
if not fasttree_exe:
    raise MissingExternalDependencyError(
        "Install FastTree and correctly set the file path to the program "
        "if you want to use it from Biopython."
    )
class FastTreeTestCase(unittest.TestCase):
    """Run the FastTree command line wrapper on a few input files."""

    def check(self, path, length):
        """Run FastTree on the FASTA file ``path`` and sanity check the tree.

        ``length`` is the number of records expected in ``path``.
        """
        input_records = SeqIO.to_dict(SeqIO.parse(path, "fasta"))
        self.assertEqual(len(input_records), length)
        # Any filenames with spaces should get escaped with quotes
        # automatically.
        # Using keyword arguments here.
        cline = _Fasttree.FastTreeCommandline(fasttree_exe, input=path, nt=True)
        self.assertEqual(str(eval(repr(cline))), str(cline))
        out, err = cline()
        self.assertTrue(err.strip().startswith("FastTree"))
        tree = Phylo.read(StringIO(out), "newick")
        # Named clades must be unique, and there must be at least one.
        names = {}
        for clade in tree.find_clades():
            if clade.name:
                self.assertNotIn(clade.name, names)
                names[clade.name] = clade
        self.assertGreater(len(names), 0)
        # Distances between adjacent terminals must all be positive.
        terminals = list(tree.find_clades(terminal=True))
        for left, right in zip(terminals, terminals[1:]):
            self.assertGreater(tree.distance(left, right), 0.0)

    def test_normal(self):
        """Run FastTree on a plain FASTA file."""
        self.check("Quality/example.fasta", 3)

    def test_filename_spaces(self):
        """Run FastTree on an input file whose name contains a space."""
        path = "Clustalw/temp horses.fasta"  # note spaces in filename
        records = SeqIO.parse("Phylip/hennigian.phy", "phylip")
        with open(path, "w") as handle:
            length = SeqIO.write(records, handle, "fasta")
        # Remove the temporary input again, whatever the outcome below.
        self.addCleanup(os.remove, path)
        self.assertEqual(length, 10)
        self.check(path, length)

    def test_invalid(self):
        """Check that a non-sequence input file raises ApplicationError."""
        path = "Medline/pubmed_result1.txt"
        cline = FastTreeCommandline(fasttree_exe, input=path)
        with self.assertRaises(ApplicationError) as cm:
            stdout, stderr = cline()
        message = str(cm.exception)
        self.assertTrue(
            "invalid format" in message
            or "not produced" in message
            or "No sequences in file" in message
            or "Error parsing header line:" in message
            or "Non-zero return code " in message,
            msg=f"Unknown ApplicationError raised: {message}",
        )

    def test_single(self):
        """Run FastTree on a file holding a single sequence."""
        path = "Fasta/f001"
        records = list(SeqIO.parse(path, "fasta"))
        self.assertEqual(len(records), 1)
        cline = FastTreeCommandline(fasttree_exe, input=path)
        stdout, stderr = cline()
        self.assertIn("Unique: 1/1", stderr)

    def test_empty(self):
        """Check that a missing input file raises ApplicationError."""
        path = "does_not_exist.fasta"
        cline = FastTreeCommandline(fasttree_exe, input=path)
        with self.assertRaises(ApplicationError) as cm:
            stdout, stderr = cline()
        message = str(cm.exception)
        self.assertTrue(
            "Cannot open sequence file" in message
            or f"Cannot read {path}" in message
            or "Non-zero return code " in message,
            msg=f"Unknown ApplicationError raised: {message}",
        )
if __name__ == "__main__":
    # Run this module's tests with verbose output.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,174 +0,0 @@
# Copyright 2013 by Christian Brueffer. All rights reserved.
#
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Tests for MSAProbs tool."""
import os
import sys
import unittest
import warnings
from subprocess import getoutput
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
from Bio import SeqIO
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Align.Applications import MSAProbsCommandline
from Bio.Application import ApplicationError
#################################################################
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"

# Command used to run MSAProbs; stays None if the probe below fails.
msaprobs_exe = None
try:
    output = getoutput("msaprobs -version")
    # Accept the tool only if its output starts with the expected banner.
    if output.startswith("MSAPROBS version"):
        msaprobs_exe = "msaprobs"
except FileNotFoundError:
    # NOTE(review): getoutput runs the command through the shell, so a
    # missing binary presumably shows up in the output text rather than as
    # this exception - confirm whether this handler is still needed.
    pass

if not msaprobs_exe:
    raise MissingExternalDependencyError(
        "Install msaprobs if you want to use MSAProbs from Biopython."
    )
class MSAProbsTestCase(unittest.TestCase):
    """Base class that tracks files to delete once a test has finished."""

    def setUp(self):
        self.files_to_clean = set()

    def tearDown(self):
        # Only registered paths that exist as regular files are removed.
        for path in filter(os.path.isfile, self.files_to_clean):
            os.remove(path)

    def add_file_to_clean(self, filename):
        """Add a file for deferred removal by the tearDown routine."""
        self.files_to_clean.add(filename)

    def standard_test_procedure(self, cline):
        """Shared testing procedure used by all tests."""
        # Mark output files for later cleanup.
        self.add_file_to_clean(cline.outfile)
        # The input is parsed into a dict; the result itself is not used.
        SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"))
        rebuilt = eval(repr(cline))
        self.assertEqual(str(rebuilt), str(cline))
        _output, _error = cline()
#################################################################
class MSAProbsTestErrorConditions(MSAProbsTestCase):
    """Inputs for which running MSAProbs must raise ApplicationError."""

    def test_empty_file(self):
        """Test an input file that does not exist."""
        input_file = "does_not_exist.fasta"
        self.assertFalse(os.path.isfile(input_file))
        cline = MSAProbsCommandline(msaprobs_exe, infile=input_file)
        try:
            stdout, stderr = cline()
        except ApplicationError as err:
            message = str(err)
            accepted = (
                "Cannot open sequence file",
                "Cannot open input file",
                "Non-zero return code ",
            )
            # Any one of the accepted fragments is enough.
            self.assertTrue(any(part in message for part in accepted), message)
        else:
            self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}")

    def test_single_sequence(self):
        """Test an input file containing a single sequence."""
        input_file = "Fasta/f001"
        self.assertTrue(os.path.isfile(input_file))
        record_count = len(list(SeqIO.parse(input_file, "fasta")))
        self.assertEqual(record_count, 1)
        cline = MSAProbsCommandline(msaprobs_exe, infile=input_file)
        try:
            stdout, stderr = cline()
        except ApplicationError as err:
            # Expected return code per platform, 139 for everything else.
            # TODO: Check return codes on various other platforms
            per_platform = {"win32": 0xC0000005, "darwin": -11}
            expected = per_platform.get(sys.platform, 139)
            self.assertEqual(expected, err.returncode)
        else:
            self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}")

    def test_invalid_format(self):
        """Test an input file in an invalid format."""
        input_file = "Medline/pubmed_result1.txt"
        self.assertTrue(os.path.isfile(input_file))
        cline = MSAProbsCommandline(msaprobs_exe, infile=input_file)
        try:
            stdout, stderr = cline()
        except ApplicationError as err:
            self.assertEqual(err.returncode, 1)
        else:
            self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}")
#################################################################
class MSAProbsTestNormalConditions(MSAProbsTestCase):
    """Inputs for which running MSAProbs is expected to succeed."""

    def _clustalw_cline(self, input_file, output_file):
        """Return a command line for the given files with clustalw=True."""
        return MSAProbsCommandline(
            msaprobs_exe, infile=input_file, outfile=output_file, clustalw=True
        )

    def test_simple_fasta(self):
        """Test a simple fasta file."""
        cline = self._clustalw_cline("Registry/seqs.fasta", "temp_test.aln")
        self.standard_test_procedure(cline)

    def test_properties(self):
        """Test setting options via properties."""
        cline = MSAProbsCommandline(msaprobs_exe)
        # Assign the options as attributes rather than constructor keywords.
        cline.infile = "Registry/seqs.fasta"
        cline.outfile = "temp_test.aln"
        cline.clustalw = True
        self.standard_test_procedure(cline)

    def test_input_filename_with_space(self):
        """Test an input filename containing a space."""
        input_file = "Clustalw/temp horses.fasta"
        # Write the input file from the phylip example data.
        source_records = SeqIO.parse("Phylip/hennigian.phy", "phylip")
        with open(input_file, "w") as handle:
            SeqIO.write(source_records, handle, "fasta")
        cline = self._clustalw_cline(input_file, "temp_test.aln")
        self.add_file_to_clean(input_file)
        self.standard_test_procedure(cline)

    def test_output_filename_with_spaces(self):
        """Test an output filename containing spaces."""
        cline = self._clustalw_cline("Registry/seqs.fasta", "temp with spaces.aln")
        self.standard_test_procedure(cline)
if __name__ == "__main__":
    # Run this module's tests with verbose output.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,191 +0,0 @@
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Unittests for Bio.Align.Applications interface for MAFFT."""
import os
import subprocess
import sys
import unittest
import warnings
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Align.Applications import MafftCommandline
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"

# Command used to run MAFFT; stays None if the probe below fails.
mafft_exe = None
if sys.platform == "win32":
    raise MissingExternalDependencyError(
        "Testing with MAFFT not implemented on Windows yet"
    )
else:
    from subprocess import getoutput

    output = getoutput("mafft -help")
    # Require the absence of the usual "command missing" phrases and the
    # presence of the tool name in the output.
    if "not found" not in output and "not recognized" not in output:
        if "MAFFT" in output:
            mafft_exe = "mafft"

if not mafft_exe:
    raise MissingExternalDependencyError(
        "Install MAFFT if you want to use the Bio.Align.Applications wrapper."
    )
def check_mafft_version(mafft_exe):
    """Run ``<mafft_exe> --help`` and return ``(major, version)``.

    ``major`` is the integer before the first dot of the version string
    found in the combined stdout/stderr text; ``version`` is that string.
    Raises MissingExternalDependencyError if the output reports a broken
    installation, if the major version is below 6, or if no version
    marker is found.
    """
    process = subprocess.Popen(
        f"{mafft_exe} --help",
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        shell=(sys.platform != "win32"),
    )
    out_text, err_text = process.communicate()
    output = out_text + "\n" + err_text

    install_problems = (
        "correctly installed?",
        "mafft binaries have to be installed",
    )
    if any(hint in output for hint in install_problems):
        raise MissingExternalDependencyError(
            "MAFFT does not seem to be correctly installed."
        )

    # e.g. "MAFFT version 5.732 (2005/09/14)\n"
    # e.g. "  MAFFT v6.717b (2009/12/03)\n"
    for marker in ("MAFFT version", "MAFFT v"):
        _, found, tail = output.partition(marker)
        if not found:
            continue
        # The version is the first whitespace-delimited word after the marker.
        version = tail.strip().split(None, 1)[0]
        major = int(version.split(".", 1)[0])
        if major < 6:
            raise MissingExternalDependencyError(
                f"Test requires MAFFT v6 or later (found {version})."
            )
        return (major, version)
    raise MissingExternalDependencyError("Couldn't determine MAFFT version.")
# This also checks it actually runs!
# version_major is used below to decide whether the PHYLIP tests are defined.
version_major, version_string = check_mafft_version(mafft_exe)
class MafftApplication(unittest.TestCase):
    """Round-trip tests for the MAFFT command line wrapper."""

    def setUp(self):
        self.infile1 = "Fasta/f002"

    def tearDown(self):
        tree_file = "Fasta/f002.tree"
        if os.path.isfile(tree_file):
            os.remove(tree_file)

    def _assert_repr_round_trip(self, cmdline):
        """Check that eval(repr(cmdline)) gives the same command string."""
        rebuilt = eval(repr(cmdline))
        self.assertEqual(str(rebuilt), str(cmdline))

    def test_Mafft_simple(self):
        """Simple round-trip through app with infile, result passed to stdout."""
        # Use a keyword argument at init,
        cmdline = MafftCommandline(mafft_exe, input=self.infile1)
        self._assert_repr_round_trip(cmdline)
        stdoutdata, stderrdata = cmdline()
        self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"))
        # Used to get "Progressive alignment ..." but in v7.245
        # became "Progressive alignment 1/2..." and "Progressive alignment 2/2..."
        progress_seen = ("Progressive alignment ..." in stderrdata) or (
            "Progressive alignment 1/" in stderrdata
        )
        self.assertTrue(progress_seen, stderrdata)
        self.assertNotIn("$#=0", stderrdata)

    def test_Mafft_with_options(self):
        """Simple round-trip through app with infile and options, result passed to stdout."""
        cmdline = MafftCommandline(mafft_exe)
        for option, value in (
            ("input", self.infile1),
            ("maxiterate", 100),
            ("--localpair", True),
        ):
            cmdline.set_parameter(option, value)
        self._assert_repr_round_trip(cmdline)
        stdoutdata, stderrdata = cmdline()
        self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"))
        self.assertNotIn("$#=0", stderrdata)

    def test_Mafft_with_Clustalw_output(self):
        """Simple round-trip through app with clustal output."""
        cmdline = MafftCommandline(mafft_exe)
        # Use some properties:
        cmdline.input = self.infile1
        cmdline.clustalout = True
        self._assert_repr_round_trip(cmdline)
        stdoutdata, stderrdata = cmdline()
        # e.g. "CLUSTAL format alignment by MAFFT ..."
        # or "CLUSTAL (-like) formatted alignment by MAFFT FFT-NS-2 (v6.240)"
        self.assertTrue(stdoutdata.startswith("CLUSTAL"), stdoutdata)
        self.assertNotIn("$#=0", stderrdata)

    # The PHYLIP output tests are only defined when version_major is 7 or more.
    if version_major >= 7:

        def test_Mafft_with_PHYLIP_output(self):
            """Simple round-trip through app with PHYLIP output."""
            cmdline = MafftCommandline(mafft_exe, input=self.infile1, phylipout=True)
            self._assert_repr_round_trip(cmdline)
            stdoutdata, stderrdata = cmdline()
            # e.g. " 3 706\n" or " 3 681" but allow some variation in the column count
            header_prefixes = (" 3 68", " 3 69", " 3 70")
            self.assertTrue(stdoutdata.startswith(header_prefixes), stdoutdata)
            self.assertIn("gi|1348912 ", stdoutdata, stdoutdata)
            self.assertNotIn("gi|1348912|gb|G26680|G26680", stdoutdata, stdoutdata)
            self.assertNotIn("$#=0", stderrdata)

        def test_Mafft_with_PHYLIP_namelength(self):
            """Check PHYLIP with --namelength."""
            cmdline = MafftCommandline(
                mafft_exe, input=self.infile1, phylipout=True, namelength=50
            )
            self._assert_repr_round_trip(cmdline)
            stdoutdata, stderrdata = cmdline()
            # e.g. " 3 706\n" or " 3 681" but allow some variation in the column count
            header_prefixes = (" 3 68", " 3 69", " 3 70")
            self.assertTrue(stdoutdata.startswith(header_prefixes), stdoutdata)
            self.assertIn("gi|1348912|gb|G26680|G26680", stdoutdata, stdoutdata)
            self.assertNotIn("$#=0", stderrdata)

    def test_Mafft_with_complex_command_line(self):
        """Round-trip with complex command line."""
        cmdline = MafftCommandline(mafft_exe)
        # Applied in this order; names are given with and without dashes.
        settings = (
            ("input", self.infile1),
            ("--localpair", True),
            ("--weighti", 4.2),
            ("retree", 5),
            ("maxiterate", 200),
            ("--nofft", True),
            ("op", 2.04),
            ("--ep", 0.51),
            ("--lop", 0.233),
            ("lep", 0.2),
            ("--reorder", True),
            ("--treeout", True),
            ("nuc", True),
        )
        for option, value in settings:
            cmdline.set_parameter(option, value)
        self._assert_repr_round_trip(cmdline)
        expected = (
            mafft_exe
            + " --localpair --weighti 4.2 --retree 5 "
            "--maxiterate 200 --nofft --op 2.04 --ep 0.51"
            " --lop 0.233 --lep 0.2 --reorder --treeout"
            " --nuc Fasta/f002"
        )
        self.assertEqual(str(cmdline), expected)
        stdoutdata, stderrdata = cmdline()
        self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"))
        self.assertNotIn("$#=0", stderrdata)
if __name__ == "__main__":
    # Run this module's tests with verbose output.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,446 +0,0 @@
# Copyright 2009-2013 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Tests for Muscle tool."""
import os
import subprocess
import sys
import unittest
import warnings
from Bio import AlignIO
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
from Bio import SeqIO
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Align.Applications import MuscleCommandline
from Bio.Application import _escape_filename
#################################################################
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"

# Command or full path used to run MUSCLE; stays None if it is not found.
muscle_exe = None
if sys.platform == "win32":
    try:
        # This can vary depending on the Windows language.
        prog_files = os.environ["PROGRAMFILES"]
    except KeyError:
        prog_files = r"C:\Program Files"
    # For Windows, MUSCLE just comes as a zip file which contains
    # a Muscle directory with the muscle.exe file plus a readme etc,
    # which the user could put anywhere. We'll try a few sensible
    # locations under Program Files... and then the full path.
    likely_dirs = [
        "",  # Current dir
        prog_files,
        os.path.join(prog_files, "Muscle3.6"),
        os.path.join(prog_files, "Muscle3.7"),
        os.path.join(prog_files, "Muscle3.8"),
        os.path.join(prog_files, "Muscle3.9"),
        os.path.join(prog_files, "Muscle"),
    ] + sys.path
    # The first folder that holds a muscle.exe wins.
    for folder in likely_dirs:
        if os.path.isdir(folder):
            if os.path.isfile(os.path.join(folder, "muscle.exe")):
                muscle_exe = os.path.join(folder, "muscle.exe")
                break
        if muscle_exe:
            break
else:
    from subprocess import getoutput

    output = getoutput("muscle -version")
    # Since "not found" may be in another language, try and be sure this is
    # really the MUSCLE tool's output
    if "not found" not in output and "not recognized" not in output:
        if "MUSCLE" in output and "Edgar" in output:
            muscle_exe = "muscle"

if not muscle_exe:
    raise MissingExternalDependencyError(
        "Install MUSCLE if you want to use the Bio.Align.Applications wrapper."
    )
#################################################################
class MuscleApplication(unittest.TestCase):
def setUp(self):
self.infile1 = "Fasta/f002"
self.infile2 = "Fasta/fa01"
self.infile3 = "Fasta/f001"
self.outfile1 = "Fasta/temp align out1.fa" # with spaces!
self.outfile2 = "Fasta/temp_align_out2.fa"
self.outfile3 = "Fasta/temp_align_out3.fa"
self.outfile4 = "Fasta/temp_align_out4.fa"
def tearDown(self):
if os.path.isfile(self.outfile1):
os.remove(self.outfile1)
if os.path.isfile(self.outfile2):
os.remove(self.outfile2)
if os.path.isfile(self.outfile3):
os.remove(self.outfile3)
if os.path.isfile(self.outfile4):
os.remove(self.outfile4)
def test_Muscle_simple(self):
"""Simple round-trip through app just infile and outfile."""
cmdline = MuscleCommandline(muscle_exe, input=self.infile1, out=self.outfile1)
self.assertEqual(
str(cmdline),
_escape_filename(muscle_exe)
+ ' -in Fasta/f002 -out "Fasta/temp align out1.fa"',
)
self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
output, error = cmdline()
self.assertEqual(output, "")
self.assertNotIn("ERROR", error)
def test_Muscle_with_options(self):
"""Round-trip through app with a switch and valued option."""
cmdline = MuscleCommandline(muscle_exe)
cmdline.set_parameter("input", self.infile1) # "input" is alias for "in"
cmdline.set_parameter("out", self.outfile2)
# Use property:
cmdline.objscore = "sp"
cmdline.noanchors = True
self.assertEqual(
str(cmdline),
_escape_filename(muscle_exe)
+ " -in Fasta/f002 -out Fasta/temp_align_out2.fa -objscore sp -noanchors",
)
self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
output, error = cmdline()
self.assertEqual(output, "")
self.assertNotIn("ERROR", error)
self.assertTrue(error.strip().startswith("MUSCLE"), output)
def test_Muscle_profile_simple(self):
"""Simple round-trip through app doing a profile alignment."""
cmdline = MuscleCommandline(muscle_exe)
cmdline.set_parameter("out", self.outfile3)
cmdline.set_parameter("profile", True)
cmdline.set_parameter("in1", self.infile2)
cmdline.set_parameter("in2", self.infile3)
self.assertEqual(
str(cmdline),
_escape_filename(muscle_exe)
+ " -out Fasta/temp_align_out3.fa -profile -in1 Fasta/fa01 -in2 Fasta/f001",
)
self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
output, error = cmdline()
self.assertEqual(output, "")
self.assertNotIn("ERROR", error)
self.assertTrue(error.strip().startswith("MUSCLE"), output)
def test_Muscle_profile_with_options(self):
    """Profile alignment, and switch and valued options."""
    # Using some keyword arguments, note -stable isn't supported in v3.8
    cmdline = MuscleCommandline(
        muscle_exe,
        out=self.outfile4,
        in1=self.infile2,
        in2=self.infile3,
        profile=True,
        stable=True,
        cluster1="neighborjoining",
    )
    self.assertEqual(
        str(cmdline),
        _escape_filename(muscle_exe)
        + " -out Fasta/temp_align_out4.fa -profile -in1 Fasta/fa01 -in2 Fasta/f001"
        + " -cluster1 neighborjoining -stable",
    )
    # repr() has to evaluate back to an equivalent command line object
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    # NOTE: The command line is only built here, it is never executed.
    # A disabled execution check used to follow; it did not work with
    # MUSCLE 3.6 on the Mac (possibly a bug fixed in MUSCLE 3.7) and it
    # depended on generic_run, which has been removed from Biopython.
class SimpleAlignTest(unittest.TestCase):
    """Simple MUSCLE tests.

    Each test builds a MuscleCommandline, checks its string form, runs it
    as a child process and compares the alignment printed on stdout with
    the input records.
    """

    # NOTE: A FASTA-output test (test_simple_fasta) used to be kept here
    # inside a string literal.  It was disabled because FASTA output seemed
    # broken on MUSCLE 3.6 (on the Mac), and it depended on generic_run,
    # which has been removed from Biopython.

    def _remove_after_test(self, path):
        """Register path for deletion once the test ends, pass or fail."""

        def remove_if_present():
            if os.path.isfile(path):
                os.remove(path)

        self.addCleanup(remove_if_present)

    def _load_records(self, path):
        """Return the FASTA records stored in path (which must exist)."""
        self.assertTrue(os.path.isfile(path))
        return list(SeqIO.parse(path, "fasta"))

    def _check_cmdline(self, cmdline, expected_args):
        """Check the string form of cmdline and its repr() round trip.

        expected_args is everything expected after the escaped executable
        name, including the leading space.
        """
        self.assertEqual(
            str(cmdline).rstrip(), _escape_filename(muscle_exe) + expected_args
        )
        self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

    def _run_muscle(self, cmdline, fmt, stdin_records=None):
        """Run cmdline; return (alignment sorted by record id, stderr text).

        The alignment is parsed from the child's stdout using format fmt.
        If stdin_records is given, the records are written to the child's
        stdin as FASTA first.  The exit status must be zero.  The pipes
        are closed by the Popen context manager even if parsing fails.
        """
        with subprocess.Popen(
            str(cmdline),
            stdin=None if stdin_records is None else subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            shell=(sys.platform != "win32"),
        ) as child:
            if stdin_records is not None:
                SeqIO.write(stdin_records, child.stdin, "fasta")
                # Closing stdin lets the alignment start
                child.stdin.close()
            align = AlignIO.read(child.stdout, fmt)
            stderr_text = child.stderr.read()
            return_code = child.wait()
        self.assertEqual(return_code, 0)
        align.sort()  # by record.id
        return align, stderr_text

    def _check_alignment(self, records, align, compare_seqs=True):
        """Check align holds the same records, in the same order, as records.

        With compare_seqs the ungapped aligned sequences must also equal
        the input sequences; otherwise only the identifiers are compared.
        """
        self.assertEqual(len(records), len(align))
        for old, new in zip(records, align):
            self.assertEqual(old.id, new.id)
            if compare_seqs:
                self.assertEqual(str(new.seq).replace("-", ""), old.seq)

    def test_simple_msf(self):
        """Simple muscle call using MSF output."""
        input_file = "Fasta/f002"
        records = self._load_records(input_file)
        records.sort(key=lambda rec: rec.id)
        cmdline = MuscleCommandline(muscle_exe, input=input_file, msf=True)
        self._check_cmdline(cmdline, " -in Fasta/f002 -msf")
        align, stderr_text = self._run_muscle(cmdline, "msf")
        # Didn't use -quiet so there should be progress reports on stderr,
        self.assertTrue(stderr_text.strip().startswith("MUSCLE"))
        self._check_alignment(records, align)

    def test_simple_clustal(self):
        """Simple muscle call using Clustal output with a MUSCLE header."""
        input_file = "Fasta/f002"
        records = self._load_records(input_file)
        records.sort(key=lambda rec: rec.id)
        # Prepare the command... use Clustal output (with a MUSCLE header)
        cmdline = MuscleCommandline(muscle_exe, input=input_file, clw=True)
        self._check_cmdline(cmdline, " -in Fasta/f002 -clw")
        align, stderr_text = self._run_muscle(cmdline, "clustal")
        # Didn't use -quiet so there should be progress reports on stderr,
        self.assertTrue(stderr_text.strip().startswith("MUSCLE"))
        self._check_alignment(records, align)

    def test_simple_clustal_strict(self):
        """Simple muscle call using strict Clustal output."""
        input_file = "Fasta/f002"
        records = self._load_records(input_file)
        records.sort(key=lambda rec: rec.id)
        # Prepare the command...
        cmdline = MuscleCommandline(muscle_exe)
        cmdline.set_parameter("in", input_file)
        # Use clustal output (with a CLUSTAL header)
        cmdline.set_parameter("clwstrict", True)  # Default None treated as False!
        self._check_cmdline(cmdline, " -in Fasta/f002 -clwstrict")
        align, stderr_text = self._run_muscle(cmdline, "clustal")
        # Didn't use -quiet so there should be progress reports on stderr,
        self.assertTrue(stderr_text.strip().startswith("MUSCLE"))
        self._check_alignment(records, align)

    def test_long(self):
        """Simple muscle call using long file."""
        # Create a large input file by converting some of another example file
        temp_large_fasta_file = "temp_cw_prot.fasta"
        records = list(SeqIO.parse("NBRF/Cw_prot.pir", "pir"))[:40]
        # Registered before writing so the file never outlives the test
        self._remove_after_test(temp_large_fasta_file)
        SeqIO.write(records, temp_large_fasta_file, "fasta")
        # Prepare the command...
        cmdline = MuscleCommandline(muscle_exe)
        cmdline.set_parameter("in", temp_large_fasta_file)
        # Use fast options
        cmdline.set_parameter("maxiters", 1)
        cmdline.set_parameter("diags", True)  # Default None treated as False!
        # Use clustal output
        cmdline.set_parameter("clwstrict", True)  # Default None treated as False!
        # Shouldn't need this, but just to make sure it is accepted
        cmdline.set_parameter("maxhours", 0.1)
        # No progress reports to stderr
        cmdline.set_parameter("quiet", True)  # Default None treated as False!
        self._check_cmdline(
            cmdline,
            " -in temp_cw_prot.fasta -diags -maxhours 0.1"
            + " -maxiters 1 -clwstrict -quiet",
        )
        align, stderr_text = self._run_muscle(cmdline, "clustal")
        records.sort(key=lambda rec: rec.id)
        self._check_alignment(records, align)
        # See if quiet worked:
        self.assertEqual("", stderr_text.strip())

    def test_using_stdin(self):
        """Simple alignment using stdin."""
        records = self._load_records("Fasta/f002")
        # Prepare the command... use Clustal output (with a MUSCLE header)
        cline = MuscleCommandline(muscle_exe, clw=True)
        self._check_cmdline(cline, " -clw")
        # The records are piped in unsorted and only sorted for comparison
        align, _ = self._run_muscle(cline, "clustal", stdin_records=records)
        records.sort(key=lambda rec: rec.id)
        self._check_alignment(records, align)

    def test_with_multiple_output_formats(self):
        """Simple muscle call with multiple output formats."""
        input_file = "Fasta/f002"
        output_html = "temp_f002.html"
        output_clwstrict = "temp_f002.clw"
        records = self._load_records(input_file)
        records.sort(key=lambda rec: rec.id)
        # Both extra output files are removed even if an assertion fails
        self._remove_after_test(output_html)
        self._remove_after_test(output_clwstrict)
        # Prepare the command... use Clustal output (with a MUSCLE header)
        cmdline = MuscleCommandline(
            muscle_exe,
            input=input_file,
            clw=True,
            htmlout=output_html,
            clwstrictout=output_clwstrict,
        )
        self._check_cmdline(
            cmdline,
            " -in Fasta/f002 -clw -htmlout temp_f002.html"
            + " -clwstrictout temp_f002.clw",
        )
        # Clustalw on stdout:
        align, stderr_text = self._run_muscle(cmdline, "clustal")
        # Didn't use -quiet so there should be progress reports on stderr,
        self.assertTrue(stderr_text.strip().startswith("MUSCLE"))
        self._check_alignment(records, align, compare_seqs=False)
        with open(output_html) as handle:
            html = handle.read().strip().upper()
        self.assertTrue(html.startswith("<HTML"))
        self.assertTrue(html.endswith("</HTML>"))
        # ClustalW strict:
        align = AlignIO.read(output_clwstrict, "clustal")
        align.sort()
        self._check_alignment(records, align, compare_seqs=False)
if __name__ == "__main__":
    # Run this module's tests with a verbosity=2 text runner
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,465 +0,0 @@
# Copyright 2009-2013 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
#
# This unit test attempts to locate the blastall executable and the nr
# database.
"""Tests for NCBI BLAST tools module."""
import os
import os.path
import re
import subprocess
import sys
import unittest
import warnings
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Application import _escape_filename
from Bio.Blast import Applications
# TODO - On windows, can we use the ncbi.ini file?
wanted = [
    "blastx",
    "blastp",
    "blastn",
    "tblastn",
    "tblastx",
    "rpsblast+",  # For Debian
    "rpsblast",
    "rpstblastn",
    "psiblast",
    "blast_formatter",
    "deltablast",
    "makeblastdb",
]
# Maps tool name -> full path of the accepted executable
exe_names = {}
# The Windows 32 bit BLAST 2.2.22+ installer does add itself to the path,
# and by default installs to C:\Program Files\NCBI\BLAST-2.2.22+\bin
# To keep things simple, assume BLAST+ is on the path on Windows.
#
# os.pathsep is ";" on Windows and ":" elsewhere, so a single split
# handles both platforms.
likely_dirs = os.environ.get("PATH", "").split(os.pathsep)
for folder in likely_dirs:
    if not os.path.isdir(folder):
        continue
    # Loop over copy as will remove entries from wanted:
    for name in wanted[:]:
        if sys.platform == "win32":
            exe_name = os.path.join(folder, name + ".exe")
        else:
            exe_name = os.path.join(folder, name)
        if not os.path.isfile(exe_name):
            continue
        # To tell the old and new rpsblast apart (since I have both on
        # my path and the old blast has priority), try -h as a parameter.
        # This should also reject WU-BLAST (since it doesn't like -h).
        child = subprocess.Popen(
            exe_name + " -h",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            shell=(sys.platform != "win32"),
        )
        output, error = child.communicate()
        if child.returncode == 0 and "ERROR: Invalid argument: -h" not in output:
            # Special case, blast_formatter from BLAST 2.2.23+ (i.e. BLAST+)
            # has mandatory argument -rid, but no -archive. We don't support it.
            if name == "blast_formatter" and " -archive " not in output:
                continue
            exe_names[name] = exe_name
            wanted.remove(name)  # can stop search for this now
        # else:
        #     print("Rejecting %r" % exe_name)
        del exe_name, name
# To avoid the name clash with legacy BLAST, Debian introduced rpsblast+ alias
if "rpsblast+" in wanted:
    wanted.remove("rpsblast+")
if "rpsblast+" in exe_names:
    exe_names["rpsblast"] = exe_names["rpsblast+"]
    del exe_names["rpsblast+"]
# We can cope with blast_formatter being missing, only added in BLAST 2.2.24+
# We can cope with deltablast being missing, only added in BLAST 2.2.26+
optional = ["blast_formatter", "deltablast"]
# By now `wanted` holds only the tools that were NOT found, so any
# non-optional name left in it (and not supplied through the rpsblast+
# alias above) is a missing requirement.  Comparing the number found with
# the number missing would let a partial installation through and the
# tests would then fail with KeyError on exe_names.
if set(wanted).difference(exe_names).difference(optional):
    raise MissingExternalDependencyError(
        "Install the NCBI BLAST+ command line tools if you want to use the "
        "Bio.Blast.Applications wrapper."
    )
class Pairwise(unittest.TestCase):
    """Searches of a query file against a subject file (no database)."""

    def _run(self, cline):
        """Execute cline, require a zero exit status and return its stdout."""
        process = subprocess.Popen(
            str(cline),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            shell=(sys.platform != "win32"),
        )
        stdout_text, _ = process.communicate()
        exit_status = process.returncode
        self.assertEqual(
            exit_status, 0, "Got error code %i back from:\n%s" % (exit_status, cline)
        )
        return stdout_text

    def test_blastp(self):
        """Pairwise BLASTP search."""
        blastp_exe = exe_names["blastp"]
        cline = Applications.NcbiblastpCommandline(
            blastp_exe,
            query="Fasta/rose.pro",
            subject="GenBank/NC_005816.faa",
            evalue=1,
        )
        expected = (
            _escape_filename(blastp_exe)
            + " -query Fasta/rose.pro -evalue 1"
            + " -subject GenBank/NC_005816.faa"
        )
        self.assertEqual(str(cline), expected)
        report = self._run(cline)
        query_lines = report.count("Query= ")
        no_hit_lines = report.count("***** No hits found *****")
        # Used to get 10 matches from 10 pairwise searches,
        # as of NCBI BLAST+ 2.3.0 only get 1 Query= line:
        if query_lines != 10:
            # Assume this is NCBI BLAST+ 2.3.0 or later,
            self.assertEqual(1, query_lines)
            self.assertEqual(0, no_hit_lines)
        elif no_hit_lines != 7:
            # Seven "no hits" happens with BLAST 2.2.26+ (potentially a
            # bug) and is tolerated; otherwise nine are expected.
            self.assertEqual(9, no_hit_lines)

    def test_blastn(self):
        """Pairwise BLASTN search."""
        blastn_exe = exe_names["blastn"]
        cline = Applications.NcbiblastnCommandline(
            blastn_exe,
            query="GenBank/NC_005816.ffn",
            subject="GenBank/NC_005816.fna",
            evalue="0.000001",
        )
        expected = (
            _escape_filename(blastn_exe)
            + " -query GenBank/NC_005816.ffn -evalue 0.000001"
            + " -subject GenBank/NC_005816.fna"
        )
        self.assertEqual(str(cline), expected)
        report = self._run(cline)
        self.assertEqual(10, report.count("Query= "))
        self.assertEqual(0, report.count("***** No hits found *****"))
        # TODO - Parse it?

    def test_tblastn(self):
        """Pairwise TBLASTN search."""
        tblastn_exe = exe_names["tblastn"]
        cline = Applications.NcbitblastnCommandline(
            tblastn_exe,
            query="GenBank/NC_005816.faa",
            subject="GenBank/NC_005816.fna",
            evalue="1e-6",
        )
        expected = (
            _escape_filename(tblastn_exe)
            + " -query GenBank/NC_005816.faa -evalue 1e-6"
            + " -subject GenBank/NC_005816.fna"
        )
        self.assertEqual(str(cline), expected)
        report = self._run(cline)
        self.assertEqual(10, report.count("Query= "))
        self.assertEqual(0, report.count("***** No hits found *****"))
        # TODO - Parse it?
class BlastDB(unittest.TestCase):
    """Tests for the makeblastdb command line wrapper."""

    def _execute(self, cline):
        """Run cline through the shell and wait for it to finish."""
        process = subprocess.Popen(
            str(cline),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            shell=(sys.platform != "win32"),
        )
        process.communicate()

    def _assert_db_files(self, fasta_path, kind):
        """Check the database files expected next to fasta_path exist.

        kind is the first letter of the file extensions: "p" for the
        protein database and "n" for the nucleotide one.
        """
        stem = f"{fasta_path}.{kind}"
        for suffix in ("hd", "hi", "hr", "in", "og"):
            self.assertTrue(os.path.isfile(stem + suffix))
        # Either spelling of these two index files is accepted
        self.assertTrue(os.path.isfile(stem + "sd") or os.path.isfile(stem + "nd"))
        self.assertTrue(os.path.isfile(stem + "si") or os.path.isfile(stem + "ni"))
        self.assertTrue(os.path.isfile(stem + "sq"))

    def test_requires_dbtype(self):
        """Check that dbtype throws error if not set."""
        cline = Applications.NcbimakeblastdbCommandline(
            exe_names["makeblastdb"], input_file="GenBank/NC_005816.faa"
        )
        with self.assertRaises(ValueError):
            str(cline)

    def test_fasta_db_prot(self):
        """Test makeblastdb wrapper with protein database."""
        makeblastdb_exe = exe_names["makeblastdb"]
        cline = Applications.NcbimakeblastdbCommandline(
            makeblastdb_exe,
            input_file="GenBank/NC_005816.faa",
            dbtype="prot",
            hash_index=True,
            max_file_sz="20MB",
            parse_seqids=True,
            taxid=10,
        )
        expected = (
            _escape_filename(makeblastdb_exe)
            + " -dbtype prot -in GenBank/NC_005816.faa"
            " -parse_seqids -hash_index -max_file_sz 20MB"
            " -taxid 10"
        )
        self.assertEqual(str(cline), expected)
        self._execute(cline)
        self._assert_db_files("GenBank/NC_005816.faa", "p")

    def test_fasta_db_prot_legacy(self):
        """Test makeblastdb wrapper with protein database legacy, version 4."""
        makeblastdb_exe = exe_names["makeblastdb"]
        cline = Applications.NcbimakeblastdbCommandline(
            makeblastdb_exe,
            blastdb_version=4,
            input_file="GenBank/NC_005816.faa",
            dbtype="prot",
            hash_index=True,
            max_file_sz="20MB",
            parse_seqids=True,
            taxid=10,
        )
        expected = (
            _escape_filename(makeblastdb_exe) + " -blastdb_version 4"
            " -dbtype prot -in GenBank/NC_005816.faa"
            " -parse_seqids -hash_index -max_file_sz 20MB"
            " -taxid 10"
        )
        self.assertEqual(str(cline), expected)
        self._execute(cline)
        self._assert_db_files("GenBank/NC_005816.faa", "p")

    def test_fasta_db_nucl(self):
        """Test makeblastdb wrapper with nucleotide database."""
        makeblastdb_exe = exe_names["makeblastdb"]
        cline = Applications.NcbimakeblastdbCommandline(
            makeblastdb_exe,
            input_file="GenBank/NC_005816.fna",
            dbtype="nucl",
            hash_index=True,
            max_file_sz="20MB",
            parse_seqids=True,
            taxid=10,
        )
        expected = (
            _escape_filename(makeblastdb_exe)
            + " -dbtype nucl -in GenBank/NC_005816.fna"
            " -parse_seqids -hash_index -max_file_sz 20MB"
            " -taxid 10"
        )
        self.assertEqual(str(cline), expected)
        self._execute(cline)
        self._assert_db_files("GenBank/NC_005816.fna", "n")

    # makeblastdb makes files in the same dir as the input, clean these up
    def tearDown(self):
        """Delete generated protein, then nucleotide, database files."""
        generated_patterns = (
            re.compile(r"NC_005816\.faa\.p.+"),
            re.compile(r"NC_005816\.fna\.n.+"),
        )
        for pattern in generated_patterns:
            for entry in os.listdir("GenBank/"):
                if pattern.match(entry):
                    os.remove(os.path.join("GenBank/", entry))
class CheckCompleteArgList(unittest.TestCase):
    """Run each BLAST+ tool's help through its command line wrapper."""

    def check(self, exe_name, wrapper):
        """Request help from one tool via wrapper and scan the usage text.

        exe_name is a key of the module-level exe_names mapping and
        wrapper is the command line class to instantiate.  The help run
        must leave stderr empty, every "[" in the usage text must have a
        closing "]", and a trivial command line must convert to a string.
        """
        exe = exe_names[exe_name]
        # dbtype must be set to initialize NcbimakeblastdbCommandline
        extra_args = {"dbtype": "prot"} if exe_name == "makeblastdb" else {}
        cline = wrapper(exe, h=True, **extra_args)
        names = {parameter.names[0] for parameter in cline.parameters}
        process = subprocess.Popen(
            str(cline),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            shell=(sys.platform != "win32"),
        )
        stdoutdata, stderrdata = process.communicate()
        self.assertEqual(stderrdata, "", f"{cline}\n{stderrdata}")
        # Collect the first word of every [bracketed] item in the usage text
        names_in_tool = set()
        remaining = stdoutdata
        while remaining:
            _, opener, remaining = remaining.partition("[")
            if not opener:
                break
            token, closer, remaining = remaining.partition("]")
            assert closer
            if " " in token:
                token = token.split(None, 1)[0]
            names_in_tool.add(token)
        # An almost trivial example to test any validation
        if "-query" in names:
            cline = wrapper(exe, query="dummy")
        elif "-archive" in names:
            cline = wrapper(exe, archive="dummy")
        str(cline)

    def test_blastx(self):
        """Check all blastx arguments are supported."""
        self.check("blastx", Applications.NcbiblastxCommandline)

    def test_blastp(self):
        """Check all blastp arguments are supported."""
        self.check("blastp", Applications.NcbiblastpCommandline)

    def test_blastn(self):
        """Check all blastn arguments are supported."""
        self.check("blastn", Applications.NcbiblastnCommandline)

    def test_tblastx(self):
        """Check all tblastx arguments are supported."""
        self.check("tblastx", Applications.NcbitblastxCommandline)

    def test_tblastn(self):
        """Check all tblastn arguments are supported."""
        self.check("tblastn", Applications.NcbitblastnCommandline)

    def test_psiblast(self):
        """Check all psiblast arguments are supported."""
        self.check("psiblast", Applications.NcbipsiblastCommandline)

    def test_rpsblast(self):
        """Check all rpsblast arguments are supported."""
        self.check("rpsblast", Applications.NcbirpsblastCommandline)

    def test_rpstblastn(self):
        """Check all rpstblastn arguments are supported."""
        self.check("rpstblastn", Applications.NcbirpstblastnCommandline)

    def test_makeblastdb(self):
        """Check all makeblastdb arguments are supported."""
        self.check("makeblastdb", Applications.NcbimakeblastdbCommandline)

    # The two optional tools only get a test when they were found
    if "blast_formatter" in exe_names:

        def test_blast_formatter(self):
            """Check all blast_formatter arguments are supported."""
            self.check("blast_formatter", Applications.NcbiblastformatterCommandline)

    if "deltablast" in exe_names:

        def test_deltablast(self):
            """Check all deltablast arguments are supported."""
            self.check("deltablast", Applications.NcbideltablastCommandline)
if __name__ == "__main__":
    # Run this module's tests with a verbosity=2 text runner
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,109 +0,0 @@
# Copyright 2009 by Tiago Antao <tiagoantao@gmail.com>. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Test GenePop."""
import os
import unittest
import warnings
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.PopGen.GenePop.Controller import GenePopController
# Tests genepop related code. Note: this case requires genepop
# test_PopGen_GenePop_nodepend tests code that does not require genepop
# Look for an entry whose name starts with "Genepop" in any PATH directory.
found = False
# .get() so an unset PATH reports the missing dependency, not a KeyError
for path in os.environ.get("PATH", "").split(os.pathsep):
    try:
        entries = os.listdir(path)
    except OSError:
        continue  # Path doesn't exist - correct to skip
    if any(filename.startswith("Genepop") for filename in entries):
        found = True
        break  # one match is enough
if not found:
    raise MissingExternalDependencyError(
        "Install GenePop if you want to use Bio.PopGen.GenePop."
    )
class AppTest(unittest.TestCase):
    """Tests genepop execution via biopython."""

    def test_allele_genotype_frequencies(self):
        """Test genepop execution on basic allele and genotype frequencies."""
        controller = GenePopController()
        gen_file = os.path.join("PopGen", "big.gen")
        # Only checks that the call and the two-way unpacking succeed; the
        # iterators are not consumed.  Disabled debugging code that used to
        # follow unpacked each pop_iter item as (pop_name, loci_content)
        # and each loci_content[locus] as (geno_list, hets, freq_fis).
        pop_iter, locus_iter = controller.calc_allele_genotype_freqs(gen_file)

    def test_calc_diversities_fis_with_identity(self):
        """Test calculations of diversities."""
        controller = GenePopController()
        gen_file = os.path.join("PopGen", "big.gen")
        result = controller.calc_diversities_fis_with_identity(gen_file)
        locus_iter, avg_fis, avg_Qintra = result
        loci = list(locus_iter)
        self.assertEqual(len(loci), 37)
        self.assertEqual(loci[0][0], "Locus1")
        self.assertEqual(len(avg_fis), 10)
        self.assertEqual(len(avg_Qintra), 10)

    def test_estimate_nm(self):
        """Test Nm estimation."""
        controller = GenePopController()
        gen_file = os.path.join("PopGen", "big.gen")
        (
            sample_size,
            priv_alleles,
            nm10,
            nm25,
            nm50,
            nm_corrected,
        ) = controller.estimate_nm(gen_file)
        comparisons = (
            (sample_size, 28.0),
            (priv_alleles, 0.016129),
            (nm10, 52.5578),
            (nm25, 15.3006),
            (nm50, 8.94583),
            (nm_corrected, 13.6612),
        )
        for observed, expected in comparisons:
            self.assertAlmostEqual(observed, expected)

    def test_fst_all(self):
        """Test genepop execution on all fst."""
        controller = GenePopController()
        gen_file = os.path.join("PopGen", "c2line.gen")
        (allFis, allFst, allFit), locus_iter = controller.calc_fst_all(gen_file)
        per_locus = list(locus_iter)
        self.assertEqual(len(per_locus), 3)
        self.assertEqual(per_locus[0][0], "136255903")
        self.assertAlmostEqual(per_locus[1][3], 0.335846)

    def test_haploidy(self):
        """Test haploidy."""
        controller = GenePopController()
        gen_file = os.path.join("PopGen", "haplo.gen")
        (allFis, allFst, allFit), locus_iter = controller.calc_fst_all(gen_file)
        per_locus = list(locus_iter)
        self.assertNotIsInstance(allFst, int)
        self.assertEqual(len(per_locus), 37)
        self.assertEqual(per_locus[36][0], "Locus37")
if __name__ == "__main__":
    # Run this module's tests with a verbosity=2 text runner
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,128 +0,0 @@
# Copyright 2009 by Tiago Antao <tiagoantao@gmail.com>. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Tests for GenePop easy-controller."""
import os
import unittest
import warnings
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.PopGen.GenePop.EasyController import EasyController
# Tests genepop related code for easy controller. Note: this requires genepop
# test_PopGen_GenePop_nodepend tests code that does not require genepop
# Look for an entry whose name starts with "Genepop" in any PATH directory.
found = False
# .get() so an unset PATH reports the missing dependency, not a KeyError
for path in os.environ.get("PATH", "").split(os.pathsep):
    try:
        entries = os.listdir(path)
    except OSError:
        continue  # Path doesn't exist - correct to skip
    if any(filename.startswith("Genepop") for filename in entries):
        found = True
        break  # one match is enough
if not found:
    raise MissingExternalDependencyError(
        "Install GenePop if you want to use Bio.PopGen.GenePop."
    )
cur_dir = os.path.abspath(".")  # Tests directory
class AppTest(unittest.TestCase):
    """Tests genepop execution via biopython using EasyController."""

    def setUp(self):
        """Enter the PopGen directory and open big.gen from there."""
        # Genepop likes to be on the directory where the file is.
        os.chdir("PopGen")
        self.ctrl = EasyController("big.gen")

    def tearDown(self):
        """Go back to the directory the tests were started from."""
        os.chdir(cur_dir)

    def test_basic_info(self):
        """Test basic info."""
        populations, loci = self.ctrl.get_basic_info()
        self.assertEqual(len(populations), 10)
        self.assertEqual(len(loci), 37)

    def test_get_heterozygosity_info(self):
        """Test heterozygosity info."""
        info = self.ctrl.get_heterozygosity_info(0, "Locus2")
        self.assertEqual(info[1], 24)
        self.assertEqual(info[3], 7)

    def test_get_alleles(self):
        """Test get alleles."""
        # Returns keys of a dict, so order is Python implementation dependent
        found_alleles = self.ctrl.get_alleles(0, "Locus3")
        self.assertCountEqual(found_alleles, [3, 20])

    def test_get_alleles_all_pops(self):
        """Test get alleles for all populations."""
        found_alleles = self.ctrl.get_alleles_all_pops("Locus4")
        self.assertEqual(found_alleles, [1, 3])

    def test_get_fis(self):
        """Test get Fis."""
        per_allele, overall = self.ctrl.get_fis(0, "Locus2")
        self.assertEqual(per_allele[3][0], 55)
        self.assertEqual(overall[0], 62)

    def test_get_allele_frequency(self):
        """Test allele frequency."""
        gene_total, frequencies = self.ctrl.get_allele_frequency(0, "Locus2")
        self.assertEqual(gene_total, 62)
        deviation = abs(frequencies[20] - 0.113)
        self.assertLess(deviation, 0.05)

    def test_get_genotype_count(self):
        """Test genotype count."""
        genotype_counts = self.ctrl.get_genotype_count(0, "Locus2")
        self.assertEqual(len(genotype_counts), 3)

    def test_estimate_nm(self):
        """Test Nm estimation."""
        estimates = self.ctrl.estimate_nm()
        self.assertEqual(estimates[0], 28.0)

    def test_hwe_excess(self):
        """Test Hardy-Weinberg Equilibrium."""
        by_locus = self.ctrl.test_hw_pop(0, "excess")
        self.assertEqual(by_locus["Locus1"], (0.4955, None, -0.16, -0.1623, 5))

    # These tests are frequently failing, possibly due to a Genepop problem.
    # def test_get_avg_fst_pair_locus(self):
    #     """Test get average Fst for pairwise pops on a locus."""
    #     self.assertEqual(len(self.ctrl.get_avg_fst_pair_locus("Locus4")), 45)
    #
    # def test_get_avg_fst_pair(self):
    #     """Test get pairwise Fst."""
    #     pop_fis = self.ctrl.get_avg_fst_pair()
    #     self.assertEqual(len(pop_fis), 45)

    def test_get_avg_fis(self):
        """Test average Fis."""
        # Only checks that the call completes; the result is not inspected
        self.ctrl.get_avg_fis()

    def test_get_multilocus_f_stats(self):
        """Test multilocus F stats."""
        stats = self.ctrl.get_multilocus_f_stats()
        self.assertEqual(len(stats), 3)
        self.assertLess(stats[0], 0.1)

    def test_get_f_stats(self):
        """Test F stats."""
        stats = self.ctrl.get_f_stats("Locus2")
        self.assertEqual(len(stats), 5)
        self.assertLess(stats[0], 0)
if __name__ == "__main__":
    # Run this module's tests with a verbosity=2 text runner
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,229 +0,0 @@
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Unittests for Bio.Align.Applications interface for PRANK."""
import os
import sys
import unittest
import warnings
from Bio import AlignIO
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
from Bio import SeqIO
from Bio.Nexus.Nexus import NexusError
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Align.Applications import PrankCommandline
from Bio.Application import _escape_filename
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"
prank_exe = None
if sys.platform == "win32":
    # This can vary depending on the Windows language.
    prog_files = os.environ.get("PROGRAMFILES", r"C:\Program Files")
    # For Windows, PRANK just comes as a zip file which contains the
    # prank.exe file which the user could put anywhere. We'll try a few
    # sensible locations under Program Files... and then the full path.
    likely_dirs = [
        # Current dir; an empty string here would never pass the
        # os.path.isdir() check below, so the directory is named explicitly
        os.curdir,
        prog_files,
        os.path.join(prog_files, "Prank"),
    ] + sys.path
    for folder in likely_dirs:
        candidate = os.path.join(folder, "prank.exe")
        if os.path.isdir(folder) and os.path.isfile(candidate):
            prank_exe = candidate
            break
else:
    from subprocess import getoutput

    output = getoutput("prank")
    # Accept only if the shell did not report a missing command and the
    # output mentions prank
    if (
        "not found" not in output
        and "not recognized" not in output
        and "prank" in output.lower()
    ):
        prank_exe = "prank"
if not prank_exe:
    raise MissingExternalDependencyError(
        "Install PRANK if you want to use the Bio.Align.Applications wrapper."
    )
class PrankApplication(unittest.TestCase):
    """Run PRANK on a small FASTA file through the PrankCommandline wrapper."""

    def setUp(self):
        """Select the FASTA input shared by every test."""
        self.infile1 = "Fasta/fa01"

    def tearDown(self):
        """Remove generated files.

        PRANK writes its results to the current directory, e.g.
        output.1.dnd output.1.fas output.1.xml output.2.dnd output.2.fas output.2.xml
        (and ``*.nex`` when NEXUS output is requested). Newer PRANK releases
        use ``output.best.*`` names instead, so those are removed as well.
        """
        for stage in ("1", "2", "best"):
            for ext in ("dnd", "fas", "xml", "nex"):
                filename = f"output.{stage}.{ext}"
                if os.path.isfile(filename):
                    os.remove(filename)

    def test_Prank_simple(self):
        """Simple round-trip through app with infile.

        output.?.??? files written to cwd - no way to redirect
        """
        cmdline = PrankCommandline(prank_exe)
        cmdline.set_parameter("d", self.infile1)
        self.assertEqual(str(cmdline), _escape_filename(prank_exe) + " -d=Fasta/fa01")
        # The repr must evaluate back to an equivalent command line object.
        self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
        output, error = cmdline()
        self.assertEqual(error, "")
        self.assertIn("Total time", output)

    def test_Prank_simple_with_NEXUS_output(self):
        """Simple round-trip through app with infile, output in NEXUS.

        output.?.??? files written to cwd - no way to redirect
        """
        records = list(SeqIO.parse(self.infile1, "fasta"))
        # Try using keyword argument,
        cmdline = PrankCommandline(prank_exe, d=self.infile1)
        # Try using a property,
        cmdline.d = self.infile1
        cmdline.f = 17  # NEXUS format
        cmdline.set_parameter("dots", True)
        self.assertEqual(
            str(cmdline), _escape_filename(prank_exe) + " -d=Fasta/fa01 -f=17 -dots"
        )
        self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
        stdout, stderr = cmdline()
        self.assertIn("Total time", stdout)
        self.assertEqual(stderr, "")
        try:
            if os.path.isfile("output.best.nex"):
                # Prank v.130820 and perhaps earlier use ".best.*" output names
                nex_fname = "output.best.nex"
            elif os.path.isfile("output.2.nex"):
                # Older Prank versions use ".2.*" output names
                nex_fname = "output.2.nex"
            else:
                raise RuntimeError("Can't find PRANK's NEXUS output (*.nex)")
            align = AlignIO.read(nex_fname, "nexus")
            for old, new in zip(records, align):
                # Old versions of Prank reduced name to 9 chars
                self.assertTrue(old.id == new.id or old.id[:9] == new.id)
                # infile1 has alignment gaps in it
                self.assertEqual(
                    str(new.seq).replace("-", ""), str(old.seq).replace("-", "")
                )
        except NexusError:
            # See bug 3119,
            # Bio.Nexus can't parse output from prank v100701 (1 July 2010)
            pass

    def test_Prank_complex_command_line(self):
        """Round-trip with complex command line."""
        cmdline = PrankCommandline(prank_exe)
        cmdline.set_parameter("d", self.infile1)
        cmdline.set_parameter("-gaprate", 0.321)
        cmdline.set_parameter("gapext", 0.6)
        cmdline.set_parameter("-dots", 1)  # i.e. True
        # Try using a property:
        cmdline.kappa = 3
        cmdline.skipins = True
        cmdline.set_parameter("-once", True)
        cmdline.realbranches = True
        self.assertEqual(
            str(cmdline),
            _escape_filename(prank_exe)
            + " -d=Fasta/fa01"
            + " -dots -gaprate=0.321 -gapext=0.6 -kappa=3"
            + " -once -skipins -realbranches",
        )
        self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
        stdout, stderr = cmdline()
        self.assertIn("Total time", stdout)
class PrankConversion(unittest.TestCase):
    """Use PRANK's -convert mode to rewrite an alignment in other formats."""

    def setUp(self):
        """Define the input alignment and the output filename prefix."""
        # As these reads are all 36, it can be seen as pre-aligned:
        self.input = "Quality/example.fasta"
        self.output = "temp with space"  # prefix, PRANK will pick extensions

    def conversion(self, prank_number, prank_ext, format):
        """Get PRANK to do a conversion, and check it with SeqIO.

        ``prank_number`` is passed to PRANK as ``-f``, ``prank_ext`` is the
        extension expected on the output file, and ``format`` is the name
        handed to AlignIO when reading the result back.
        """
        filename = f"{self.output}.{prank_ext}"
        # Start from a clean slate so a stale file cannot satisfy the checks.
        if os.path.isfile(filename):
            os.remove(filename)

        quoted_prefix = f'"{self.output}"'
        cmdline = PrankCommandline(
            prank_exe,
            d=self.input,
            convert=True,
            f=prank_number,
            o=quoted_prefix,
        )
        expected_command = (
            _escape_filename(prank_exe)
            + f' -d={self.input} -o="{self.output}" -f={prank_number} -convert'
        )
        self.assertEqual(str(cmdline), expected_command)
        self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

        message, error = cmdline()
        self.assertIn("PRANK", message)
        self.assertIn((f"converting '{self.input}' to '{filename}'"), message, message)
        self.assertEqual(error, "")
        self.assertTrue(os.path.isfile(filename))

        old = AlignIO.read(self.input, "fasta")
        # Hack: shorten the expected names to 10 characters before comparing
        # against PHYLIP output.
        if format == "phylip":
            for record in old:
                record.id = record.id[:10]
        new = AlignIO.read(filename, format)
        self.assertEqual(len(old), len(new))
        for expected, converted in zip(old, new):
            self.assertEqual(expected.id, converted.id)
            self.assertEqual(expected.seq, converted.seq)
        os.remove(filename)

    def test_convert_to_fasta(self):
        """Convert FASTA to FASTA format."""
        self.conversion(8, "fas", "fasta")

    # Prank v.100701 seems to output an invalid file here...
    # def test_convert_to_phylip32(self):
    #    """Convert FASTA to PHYLIP 3.2 format."""
    #    self.conversion(11, "phy", "phylip")

    def test_convert_to_phylip(self):
        """Convert FASTA to PHYLIP format."""
        self.conversion(12, "phy", "phylip")

    # PRANK truncated the record names in the matrix block. An error?
    # def test_convert_to_paup_nexus(self):
    #    """Convert FASTA to PAUP/NEXUS."""
    #    self.conversion(17, "nex", "nexus")

    # We don't support format 18, PAML
if __name__ == "__main__":
    # Run the tests verbosely when the module is executed as a script.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,105 +0,0 @@
# Copyright 2009 by Cymon J. Cox. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Unittests for Bio.Align.Applications interface for PROBCONS."""
import os
import sys
import unittest
import warnings
from io import StringIO
from Bio import AlignIO
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
from Bio import SeqIO
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Align.Applications import ProbconsCommandline
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"

# Locate the PROBCONS executable; the module is skipped when it is missing.
probcons_exe = None
if sys.platform == "win32":
    raise MissingExternalDependencyError("PROBCONS not available on Windows")

from subprocess import getoutput

output = getoutput("probcons")
# Reject the shell's "command not found" style messages and require the
# text to mention probcons before trusting it.
if (
    "not found" not in output
    and "not recognized" not in output
    and "probcons" in output.lower()
):
    probcons_exe = "probcons"

if not probcons_exe:
    raise MissingExternalDependencyError(
        "Install PROBCONS if you want to use the Bio.Align.Applications wrapper."
    )
class ProbconsApplication(unittest.TestCase):
    """Exercise the ProbconsCommandline wrapper against the PROBCONS binary."""

    def setUp(self):
        """Define the input FASTA file and the annotation output path."""
        self.infile1 = "Fasta/fa01"
        self.annotation_outfile = "Fasta/probcons_annot.out"

    def tearDown(self):
        """Delete the annotation file if a test produced it."""
        annotation_path = self.annotation_outfile
        if os.path.isfile(annotation_path):
            os.remove(annotation_path)

    def test_Probcons_alignment_fasta(self):
        """Round-trip through app and read fasta alignment from stdout."""
        cmdline = ProbconsCommandline(probcons_exe, input=self.infile1)
        self.assertEqual(str(cmdline), probcons_exe + " Fasta/fa01")
        self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
        stdout, stderr = cmdline()
        self.assertTrue(stderr.startswith("\nPROBCONS"))
        align = AlignIO.read(StringIO(stdout), "fasta")
        records = list(SeqIO.parse(self.infile1, "fasta"))
        self.assertEqual(len(records), len(align))
        for original, aligned in zip(records, align):
            self.assertEqual(original.id, aligned.id)
            # Ignore gaps: only the residues must survive the alignment.
            ungapped_aligned = str(aligned.seq).replace("-", "")
            ungapped_original = str(original.seq).replace("-", "")
            self.assertEqual(ungapped_aligned, ungapped_original)

    def test_Probcons_alignment_clustalw(self):
        """Round-trip through app and read clustalw alignment from stdout."""
        cmdline = ProbconsCommandline(probcons_exe)
        cmdline.set_parameter("input", "Fasta/fa01")
        cmdline.clustalw = True
        self.assertEqual(str(cmdline), probcons_exe + " -clustalw Fasta/fa01")
        self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
        stdout, stderr = cmdline()
        self.assertTrue(stderr.strip().startswith("PROBCONS"))
        align = AlignIO.read(StringIO(stdout), "clustal")
        records = list(SeqIO.parse(self.infile1, "fasta"))
        self.assertEqual(len(records), len(align))
        for original, aligned in zip(records, align):
            self.assertEqual(original.id, aligned.id)
            ungapped_aligned = str(aligned.seq).replace("-", "")
            ungapped_original = str(original.seq).replace("-", "")
            self.assertEqual(ungapped_aligned, ungapped_original)

    def test_Probcons_complex_commandline(self):
        """Round-trip through app with complex command line and output file."""
        cmdline = ProbconsCommandline(probcons_exe, pre=1)
        cmdline.set_parameter("input", "Fasta/fa01")
        cmdline.consistency = 4
        cmdline.set_parameter("--iterative-refinement", 222)
        cmdline.set_parameter("a", True)
        cmdline.annot = self.annotation_outfile
        expected_command = (
            probcons_exe
            + " -c 4 -ir 222 -pre 1 -annot Fasta/probcons_annot.out -a Fasta/fa01"
        )
        self.assertEqual(str(cmdline), expected_command)
        stdout, stderr = cmdline()
        self.assertTrue(stderr.startswith("\nPROBCONS"))
        self.assertTrue(stdout.startswith(">AK1H_ECOLI/1-378"))
if __name__ == "__main__":
    # Run the tests verbosely when the module is executed as a script.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,184 +0,0 @@
# Copyright 2009 by Cymon J. Cox. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Unittests for Bio.Align.Applications interface for TCOFFEE."""
import os
import sys
import unittest
import warnings
from Bio import AlignIO
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
from Bio import SeqIO
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Align.Applications import TCoffeeCommandline
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"

# Locate the T-Coffee executable; the module is skipped when it is missing.
t_coffee_exe = None
if sys.platform == "win32":
    raise MissingExternalDependencyError("Testing TCOFFEE on Windows not supported yet")

from subprocess import getoutput

output = getoutput("t_coffee -version")
# Reject the shell's "command not found" style messages, then accept either
# spelling of the program name in the version banner.
if (
    "not found" not in output
    and "not recognized" not in output
    and ("t_coffee" in output.lower() or "t-coffee" in output.lower())
):
    t_coffee_exe = "t_coffee"

if not t_coffee_exe:
    raise MissingExternalDependencyError(
        "Install TCOFFEE if you want to use the Bio.Align.Applications wrapper."
    )
class TCoffeeApplication(unittest.TestCase):
    """Run T-Coffee through TCoffeeCommandline and parse each output format."""

    def setUp(self):
        """Define the input FASTA file and every output path the tests use."""
        self.infile1 = "Fasta/fa01"
        # TODO: Use a temp dir for the output files:
        self.outfile1 = "fa01.aln"
        self.outfile2 = "fa01.html"  # Written by default when no output set
        self.outfile3 = "Fasta/tc_out.pir"
        self.outfile4 = "Fasta/tc_out.aln"
        self.outfile5 = "Fasta/tc_out.phy"
        self.outfile6 = "Fasta/tc_out.msf"

    def tearDown(self):
        """Remove every output file a test may have produced.

        Includes ``outfile6`` (the MSF output), which used to be left behind.
        """
        for filename in (
            self.outfile1,
            self.outfile2,
            self.outfile3,
            self.outfile4,
            self.outfile5,
            self.outfile6,
        ):
            if os.path.isfile(filename):
                os.remove(filename)

    def test_TCoffee_fasta(self):
        """Round-trip through app and read clustal alignment from file."""
        cmdline = TCoffeeCommandline(t_coffee_exe, infile=self.infile1)
        self.assertEqual(str(cmdline), t_coffee_exe + " -infile Fasta/fa01")
        stdout, stderr = cmdline()
        self.assertTrue(stderr.strip().startswith("PROGRAM: T-COFFEE"))
        align = AlignIO.read(self.outfile1, "clustal")
        records = list(SeqIO.parse(self.infile1, "fasta"))
        self.assertEqual(len(records), len(align))
        for old, new in zip(records, align):
            self.assertEqual(old.id, new.id)
            # Compare residues only; gap placement is up to the aligner.
            self.assertEqual(
                str(new.seq).replace("-", ""), str(old.seq).replace("-", "")
            )

    def test_TCoffee_pir(self):
        """Round-trip through app and read pir alignment from file."""
        cmdline = TCoffeeCommandline(t_coffee_exe, quiet=True)
        cmdline.infile = self.infile1
        cmdline.outfile = self.outfile3
        cmdline.output = "pir_aln"
        self.assertEqual(
            str(cmdline),
            t_coffee_exe
            + " -output pir_aln -infile Fasta/fa01 -outfile Fasta/tc_out.pir -quiet",
        )
        stdout, stderr = cmdline()
        # Can get warnings in stderr output
        self.assertNotIn("error", stderr.lower(), stderr)
        align = AlignIO.read(self.outfile3, "pir")
        records = list(SeqIO.parse(self.infile1, "fasta"))
        self.assertEqual(len(records), len(align))
        for old, new in zip(records, align):
            self.assertEqual(old.id, new.id)
            self.assertEqual(
                str(new.seq).replace("-", ""), str(old.seq).replace("-", "")
            )

    def test_TCoffee_clustalw(self):
        """Round-trip through app and read clustalw alignment from file."""
        cmdline = TCoffeeCommandline(t_coffee_exe, gapopen=-2)
        cmdline.infile = self.infile1
        cmdline.outfile = self.outfile4
        cmdline.set_parameter("output", "clustalw_aln")
        cmdline.outorder = "input"
        cmdline.set_parameter("gapext", -5)
        cmdline.type = "protein"
        self.assertEqual(
            str(cmdline),
            t_coffee_exe
            + " -output clustalw_aln -infile Fasta/fa01 -outfile Fasta/tc_out.aln "
            "-type protein -outorder input -gapopen -2 -gapext -5",
        )
        stdout, stderr = cmdline()
        self.assertTrue(stderr.strip().startswith("PROGRAM: T-COFFEE"))
        align = AlignIO.read(self.outfile4, "clustal")
        records = list(SeqIO.parse(self.infile1, "fasta"))
        self.assertEqual(len(records), len(align))
        for old, new in zip(records, align):
            self.assertEqual(old.id, new.id)
            self.assertEqual(
                str(new.seq).replace("-", ""), str(old.seq).replace("-", "")
            )

    def test_TCoffee_phylip(self):
        """Round-trip through app and read PHYLIP alignment from file."""
        cmdline = TCoffeeCommandline(
            t_coffee_exe,
            infile=self.infile1,
            outfile=self.outfile5,
            quiet=True,
            output="phylip_aln",
        )
        self.assertEqual(
            str(cmdline),
            t_coffee_exe + " -output phylip_aln "
            "-infile Fasta/fa01 -outfile Fasta/tc_out.phy -quiet",
        )
        stdout, stderr = cmdline()
        # Can get warnings in stderr output
        self.assertNotIn("error", stderr.lower(), stderr)
        align = AlignIO.read(self.outfile5, "phylip")
        records = list(SeqIO.parse(self.infile1, "fasta"))
        self.assertEqual(len(records), len(align))
        for old, new in zip(records, align):
            # TCoffee does strict 10 character truncation as per original PHYLIP
            self.assertEqual(old.id[:10], new.id[:10])
            self.assertEqual(
                str(new.seq).replace("-", ""), str(old.seq).replace("-", "")
            )

    def test_TCoffee_msf(self):
        """Round-trip through app and read GCG MSF alignment from file."""
        cmdline = TCoffeeCommandline(
            t_coffee_exe,
            infile=self.infile1,
            outfile=self.outfile6,
            quiet=True,
            output="msf_aln",
        )
        self.assertEqual(
            str(cmdline),
            t_coffee_exe
            + " -output msf_aln -infile Fasta/fa01 -outfile Fasta/tc_out.msf -quiet",
        )
        stdout, stderr = cmdline()
        # Can get warnings in stderr output
        self.assertNotIn("error", stderr.lower(), stderr)
        align = AlignIO.read(self.outfile6, "msf")
        records = list(SeqIO.parse(self.infile1, "fasta"))
        self.assertEqual(len(records), len(align))
        for old, new in zip(records, align):
            self.assertEqual(old.id, new.id)
            self.assertEqual(
                str(new.seq).replace("-", ""), str(old.seq).replace("-", "")
            )
if __name__ == "__main__":
    # Run the tests verbosely when the module is executed as a script.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,188 +0,0 @@
# Copyright 2012 by Christian Brueffer. All rights reserved.
#
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Tests for XXmotif tool."""
import glob
import os
import shutil
import sys
import unittest
import warnings
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
from Bio import SeqIO
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Application import ApplicationError
from Bio.motifs.applications import XXmotifCommandline
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"

# Locate the XXmotif executable; the module is skipped when it is missing.
xxmotif_exe = None
if sys.platform == "win32":
    # TODO
    raise MissingExternalDependencyError(
        "Testing this on Windows is not implemented yet"
    )

from subprocess import getoutput

output = getoutput("XXmotif")
# Only trust the output if it carries XXmotif's own version banner.
if "== XXmotif version" in output:
    xxmotif_exe = "XXmotif"

if not xxmotif_exe:
    raise MissingExternalDependencyError(
        "Install XXmotif if you want to use XXmotif from Biopython."
    )
class XXmotifTestCase(unittest.TestCase):
    """Shared fixtures and helpers for the XXmotif tests."""

    def setUp(self):
        """Name the output directory and start with nothing to clean up."""
        self.out_dir = "xxmotif-temp"
        self.files_to_clean = set()

    def tearDown(self):
        """Delete every registered file and the output directory, if present."""
        for filename in self.files_to_clean:
            if os.path.isfile(filename):
                os.remove(filename)

        if os.path.isdir(self.out_dir):
            shutil.rmtree(self.out_dir)

    def standard_test_procedure(self, cline):
        """Shared test procedure used by all tests.

        Runs the command line, then checks that the output directory exists
        and contains at least one file of each expected kind.
        """
        output, error = cline()

        self.assertTrue(os.path.isdir(self.out_dir))
        expected_patterns = (
            "*.meme",
            "*_MotifFile.txt",
            "*_Pvals.txt",
            "*.pwm",
            "*_sequence.txt",
        )
        for pattern in expected_patterns:
            self.assertTrue(glob.glob(os.path.join(self.out_dir, pattern)))

        # TODO
        # Parsing the MEME file would be nice, but unfortunately the
        # MEME parser does not like what XXmotif produces yet.

    def copy_and_mark_for_cleanup(self, path):
        """Copy file to working directory and marks it for removal.

        XXmotif currently only handles a canonical filename as input, no paths.
        This method copies the specified file in the specified path to the
        current working directory and marks it for removal.
        """
        filename = os.path.basename(path)
        shutil.copyfile(path, filename)
        self.add_file_to_clean(filename)
        return filename

    def add_file_to_clean(self, filename):
        """Add a file for deferred removal by the tearDown routine."""
        self.files_to_clean.add(filename)
class XXmotifTestErrorConditions(XXmotifTestCase):
    """Check how the XXmotif wrapper behaves on bad input."""

    def _assert_fails_with_255(self, cline):
        """Run ``cline`` and require an ApplicationError with return code 255."""
        try:
            stdout, stderr = cline()
        except ApplicationError as err:
            self.assertEqual(err.returncode, 255)
            return
        self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}")

    def test_empty_file(self):
        """Test a non-existing input file."""
        input_file = "does_not_exist.fasta"
        self.assertFalse(os.path.isfile(input_file))
        cline = XXmotifCommandline(outdir=self.out_dir, seqfile=input_file)
        self._assert_fails_with_255(cline)

    def test_invalid_format(self):
        """Test an input file in an invalid format."""
        input_file = self.copy_and_mark_for_cleanup("Medline/pubmed_result1.txt")
        cline = XXmotifCommandline(outdir=self.out_dir, seqfile=input_file)
        self._assert_fails_with_255(cline)

    def test_output_directory_with_space(self):
        """Test an output directory containing a space."""
        temp_out_dir = "xxmotif test"
        input_file = self.copy_and_mark_for_cleanup("Fasta/f002")

        # Constructing the command line is expected to raise ValueError.
        rejected = False
        try:
            XXmotifCommandline(outdir=temp_out_dir, seqfile=input_file)
        except ValueError:
            rejected = True
        if not rejected:
            self.fail("expected ValueError")
class XXmotifTestNormalConditions(XXmotifTestCase):
    """Run XXmotif on valid inputs and check the standard output files."""

    def test_fasta_one_sequence(self):
        """Test a fasta input file containing only one sequence."""
        all_records = list(SeqIO.parse("Registry/seqs.fasta", "fasta"))
        record = all_records[0]
        input_file = "seq.fasta"
        with open(input_file, "w") as handle:
            SeqIO.write(record, handle, "fasta")

        cline = XXmotifCommandline(outdir=self.out_dir, seqfile=input_file)
        self.add_file_to_clean(input_file)
        self.standard_test_procedure(cline)

    def test_properties(self):
        """Test setting options via properties."""
        input_file = self.copy_and_mark_for_cleanup("Fasta/f002")
        cline = XXmotifCommandline(outdir=self.out_dir, seqfile=input_file)

        # Set the options as attributes, not constructor keywords.
        cline.revcomp = True
        cline.pseudo = 20
        cline.startmotif = "ACGGGT"

        self.standard_test_procedure(cline)

    def test_large_fasta_file(self):
        """Test a large fasta input file."""
        records = list(SeqIO.parse("NBRF/B_nuc.pir", "pir"))
        input_file = "temp_b_nuc.fasta"
        with open(input_file, "w") as handle:
            SeqIO.write(records, handle, "fasta")

        cline = XXmotifCommandline(outdir=self.out_dir, seqfile=input_file)
        self.add_file_to_clean(input_file)
        self.standard_test_procedure(cline)

    def test_input_filename_with_space(self):
        """Test an input filename containing a space."""
        records = SeqIO.parse("Phylip/hennigian.phy", "phylip")
        input_file = "temp horses.fasta"
        with open(input_file, "w") as handle:
            SeqIO.write(records, handle, "fasta")

        cline = XXmotifCommandline(outdir=self.out_dir, seqfile=input_file)
        self.add_file_to_clean(input_file)
        self.standard_test_procedure(cline)
if __name__ == "__main__":
    # Run the tests verbosely when the module is executed as a script.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,87 +0,0 @@
# Copyright (C) 2012 by Eric Talevich.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Unit tests for Bio.Phylo.Applications wrappers."""
import os
import sys
import unittest
import warnings
from subprocess import getoutput
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
from Bio import Phylo
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Phylo.Applications import PhymlCommandline
# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"

# Locate the PhyML executable; the module is skipped when it is missing.
phyml_exe = None
if sys.platform == "win32":
    exe_name = "PhyML-3.1_win32.exe"
else:
    exe_name = "phyml"
output = getoutput(f"{exe_name} --version")
# Looks like this:
# . This is PhyML version 20120412.
if "PhyML" in output and "20" in output:
    phyml_exe = exe_name

if not phyml_exe:
    raise MissingExternalDependencyError(
        "Couldn't find the PhyML software. Install PhyML 3.0 or later if you want "
        "to use the Bio.Phylo.Applications wrapper."
    )

# Example Phylip file with 4 aligned protein sequences
EX_PHYLIP = "Phylip/interlaced2.phy"
class AppTests(unittest.TestCase):
    """Tests for application wrappers."""

    def test_phyml(self):
        """Run PhyML using the wrapper.

        Smoke test: run PhyML on the example PHYLIP alignment, require output
        on stdout and nothing on stderr, then parse the resulting tree and
        check that it has four terminals. Generated files are always removed.
        """
        # Stabilize phyml tests by running in single threaded mode by default.
        # Note: PHYMLCPUS environment is specific to Debian and derivatives.
        # Assign through os.environ instead of calling os.putenv(): putenv()
        # does not update os.environ, so later lookups in this process would
        # not see the value.
        if not os.environ.get("PHYMLCPUS"):
            os.environ["PHYMLCPUS"] = "1"
        cmd = PhymlCommandline(phyml_exe, input=EX_PHYLIP, datatype="aa")
        # Smoke test
        try:
            out, err = cmd()
            self.assertGreater(len(out), 0)
            self.assertEqual(len(err), 0)
            # Check the output tree
            outfname = EX_PHYLIP + "_phyml_tree.txt"
            if not os.path.isfile(outfname):
                # NB: Briefly, PhyML dropped the .txt suffix (#919)
                outfname = outfname[:-4]
            tree = Phylo.read(outfname, "newick")
            self.assertEqual(tree.count_terminals(), 4)
        except Exception as exc:
            # Report any problem (including failed assertions above) as a
            # single test failure carrying the original message.
            self.fail(f"PhyML wrapper error: {exc}")
        finally:
            # Clean up generated files
            for suffix in [
                "_phyml_tree.txt",
                "_phyml_tree",
                "_phyml_stats.txt",
                "_phyml_stats",
            ]:
                fname = EX_PHYLIP + suffix
                if os.path.isfile(fname):
                    os.remove(fname)


# ---------------------------------------------------------
if __name__ == "__main__":
    # Run the tests verbosely when the module is executed as a script.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,79 +0,0 @@
# Copyright (C) 2012 by Eric Talevich.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Unit tests for Bio.Phylo.Applications wrappers."""
import os
import unittest
import warnings
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
from Bio import Phylo
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Phylo.Applications import RaxmlCommandline
# Locate the RAxML executable; the module is skipped when it is missing.
raxml_exe = None
try:
    from subprocess import getoutput

    output = getoutput("raxmlHPC -v")
    # Reject the shell's "command not found" style messages and require
    # RAxML's own banner before trusting the output.
    if (
        "not found" not in output
        and "not recognized" not in output
        and "This is RAxML" in output
    ):
        raxml_exe = "raxmlHPC"
except FileNotFoundError:
    pass

if not raxml_exe:
    raise MissingExternalDependencyError(
        "Install RAxML (binary raxmlHPC) if you want"
        " to test the Bio.Phylo.Applications wrapper."
    )

# Example Phylip file with 4 aligned protein sequences
EX_PHYLIP = "Phylip/interlaced2.phy"
class AppTests(unittest.TestCase):
    """Tests for application wrappers."""

    def test_raxml(self):
        """Run RAxML using the wrapper."""
        cmd = RaxmlCommandline(
            raxml_exe, sequences=EX_PHYLIP, model="PROTCATWAG", name="test"
        )
        # The parsimony seed should be set automatically
        self.assertIn("-p", str(cmd))

        # Files RAxML writes for the run named "test".
        generated_files = (
            "RAxML_info.test",
            "RAxML_log.test",
            "RAxML_parsimonyTree.test",
            "RAxML_result.test",
            # Present in 7.2.X+ but not 7.0.4:
            "RAxML_bestTree.test",
        )
        # Smoke test
        try:
            out, err = cmd()
            self.assertGreater(len(out), 0)
            self.assertEqual(len(err), 0)
            # Check the output tree
            tree = Phylo.read("RAxML_result.test", "newick")
            self.assertEqual(tree.count_terminals(), 4)
        finally:
            # Remove RAxML-generated files, or RAxML will complain bitterly
            # during the next run
            for fname in generated_files:
                if os.path.isfile(fname):
                    os.remove(fname)


# ---------------------------------------------------------
if __name__ == "__main__":
    # Run the tests verbosely when the module is executed as a script.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -1,274 +0,0 @@
# Copyright 2014 by Saket Choudhary. Based on test_Clustalw_tool.py by Peter
# Cock .
#
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
# Last Checked with samtools [0.1.18 (r982:295)]
"""Tests for samtools tool."""
import os
import sys
import unittest
import warnings
from Bio import BiopythonDeprecationWarning
from Bio import MissingExternalDependencyError
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=BiopythonDeprecationWarning)
from Bio.Application import ApplicationError
from Bio.Sequencing.Applications import SamtoolsCalmdCommandline
from Bio.Sequencing.Applications import SamtoolsCatCommandline
from Bio.Sequencing.Applications import SamtoolsFaidxCommandline
from Bio.Sequencing.Applications import SamtoolsIdxstatsCommandline
from Bio.Sequencing.Applications import SamtoolsIndexCommandline
from Bio.Sequencing.Applications import SamtoolsMergeCommandline
from Bio.Sequencing.Applications import SamtoolsMpileupCommandline
from Bio.Sequencing.Applications import SamtoolsSortCommandline
from Bio.Sequencing.Applications import SamtoolsVersion1xSortCommandline
from Bio.Sequencing.Applications import SamtoolsViewCommandline
# TODO from Bio.Sequencing.Applications import SamtoolsPhaseCommandline
# TODO from Bio.Sequencing.Applications import SamtoolsReheaderCommandline
# TODO from Bio.Sequencing.Applications import SamtoolsRmdupCommandline
# TODO from Bio.Sequencing.Applications import SamtoolsTargetcutCommandline
# TODO from Bio.Sequencing.Applications import SamtoolsFixmateCommandline
#################################################################
# Alias that makes the version 0.x sort wrapper explicit next to the 1.x one.
SamtoolsVersion0xSortCommandline = SamtoolsSortCommandline

# Try to avoid problems when the OS is in another language
os.environ["LANG"] = "C"

# Locate the samtools executable; the module is skipped when it is missing.
samtools_exe = None
if sys.platform == "win32":
    # TODO - Check the path?
    # The Program Files location can vary depending on the Windows language.
    prog_files = os.environ.get("PROGRAMFILES", r"C:\Program Files")
    # By default tries C:\Program Files\samtools\samtools.exe
    # or C:\Program Files\samtools.exe was chosen
    likely_dirs = ["samtools", ""]
    likely_exes = ["samtools.exe"]
    for folder in likely_dirs:
        if not os.path.isdir(os.path.join(prog_files, folder)):
            continue
        for filename in likely_exes:
            if os.path.isfile(os.path.join(prog_files, folder, filename)):
                samtools_exe = os.path.join(prog_files, folder, filename)
                break
        # The inner break only leaves the filename loop; stop searching
        # folders as soon as an executable has been found.
        if samtools_exe:
            break
else:
    from subprocess import getoutput

    output = getoutput("samtools")
    # Since "not found" may be in another language, try and be sure this is
    # really the samtools tool's output
    is_samtools_banner = "samtools (Tools for alignments in the SAM format)" in output
    if is_samtools_banner and "not found" not in output:
        samtools_exe = "samtools"
    del is_samtools_banner

if not samtools_exe:
    raise MissingExternalDependencyError(
        "Install samtools and correctly set the file path to "
        "the program if you want to use it from Biopython"
    )
class SamtoolsTestCase(unittest.TestCase):
"""Class for implementing Samtools test cases."""
def setUp(self):
self.files_to_clean = set()
self.samfile1 = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "SamBam", "sam1.sam"
)
self.reference = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"BWA",
"human_g1k_v37_truncated.fasta",
)
self.referenceindexfile = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"BWA",
"human_g1k_v37_truncated.fasta.fai",
)
self.samfile2 = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "SamBam", "sam2.sam"
)
self.bamfile1 = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "SamBam", "bam1.bam"
)
self.bamfile2 = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "SamBam", "bam2.bam"
)
self.outsamfile = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "SamBam", "out.sam"
)
self.outbamfile = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "SamBam", "out.bam"
)
self.bamindexfile1 = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "SamBam", "bam1.bam.bai"
)
self.sortedbamfile1 = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "SamBam", "bam1_sorted.bam"
)
self.sortedbamfile2 = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "SamBam", "bam2_sorted.bam"
)
self.files_to_clean = [
self.referenceindexfile,
self.bamindexfile1,
self.outbamfile,
]
def tearDown(self):
for filename in self.files_to_clean:
if os.path.isfile(filename):
os.remove(filename)
def test_view(self):
"""Test for samtools view."""
cmdline = SamtoolsViewCommandline(samtools_exe)
cmdline.set_parameter("input_file", self.bamfile1)
stdout_bam, stderr_bam = cmdline()
self.assertTrue(
stderr_bam.startswith(""),
f"SAM file viewing failed: \n{cmdline}\nStdout:{stdout_bam}",
)
cmdline.set_parameter("input_file", self.samfile1)
cmdline.set_parameter("S", True)
stdout_sam, stderr_sam = cmdline()
self.assertTrue(
stdout_sam.startswith("HWI-1KL120:88:D0LRBACXX:1:1101:1780:2146"),
f"SAM file viewing failed:\n{cmdline}\nStderr:{stderr_sam}",
)
def create_fasta_index(self):
"""Create index for reference fasta sequence."""
cmdline = SamtoolsFaidxCommandline(samtools_exe)
cmdline.set_parameter("reference", self.reference)
stdout, stderr = cmdline()
def create_bam_index(self, input_bam):
"""Create index of an input bam file."""
cmdline = SamtoolsIndexCommandline(samtools_exe)
cmdline.set_parameter("input_bam", input_bam)
stdout, stderr = cmdline()
def test_faidx(self):
cmdline = SamtoolsFaidxCommandline(samtools_exe)
cmdline.set_parameter("reference", self.reference)
stdout, stderr = cmdline()
self.assertFalse(stderr, f"Samtools faidx failed:\n{cmdline}\nStderr:{stderr}")
self.assertTrue(os.path.isfile(self.referenceindexfile))
def test_calmd(self):
"""Test for samtools calmd."""
self.create_fasta_index()
cmdline = SamtoolsCalmdCommandline(samtools_exe)
cmdline.set_parameter("reference", self.reference)
cmdline.set_parameter("input_bam", self.bamfile1)
# If there is no index file for the reference
# samtools calmd creates one at the time of calling
if os.path.exists(self.referenceindexfile):
# print("exists")
stderr_calmd_expected = ""
else:
# print("doesn't exist")
stderr_calmd_expected = "[fai_load] build FASTA index.\n"
stdout, stderr = cmdline()
self.assertEqual(stderr, stderr_calmd_expected)
def test_cat(self):
cmdline = SamtoolsCatCommandline(samtools_exe)
cmdline.set_parameter("o", self.outbamfile)
cmdline.set_parameter("input_bam", [self.bamfile1, self.bamfile2])
stdout, stderr = cmdline()
self.assertEqual(stderr, "")
# TODO: def test_fixmate(self):
def test_sort(self):
cmdline = SamtoolsVersion0xSortCommandline(samtools_exe)
cmdline.set_parameter("input", self.bamfile1)
cmdline.set_parameter("out_prefix", "SamBam/out")
try:
stdout, stderr = cmdline()
except ApplicationError as err:
if (
"[bam_sort] Use -T PREFIX / -o FILE to specify temporary and final output files"
in str(err)
):
cmdline = SamtoolsVersion1xSortCommandline(samtools_exe)
cmdline.set_parameter("input", self.bamfile1)
cmdline.set_parameter("-T", "out")
cmdline.set_parameter("-o", "out.bam")
try:
stdout, stderr = cmdline()
except ApplicationError:
raise
else:
raise
self.assertFalse(stderr, f"Samtools sort failed:\n{cmdline}\nStderr:{stderr}")
def test_index(self):
    """Run samtools index on bamfile1 and check the index file exists."""
    indexer = SamtoolsIndexCommandline(samtools_exe)
    indexer.set_parameter("input_bam", self.bamfile1)
    _, err_text = indexer()
    failure_note = f"Samtools index failed:\n{indexer}\nStderr:{err_text}"
    self.assertFalse(err_text, failure_note)
    self.assertTrue(os.path.exists(self.bamindexfile1))
def test_idxstats(self):
    """Run samtools idxstats on bamfile1 after indexing it."""
    # Build the index first via the helper, then query it.
    self.create_bam_index(self.bamfile1)
    idxstats = SamtoolsIdxstatsCommandline(samtools_exe)
    idxstats.set_parameter("input_bam", self.bamfile1)
    _, err_text = idxstats()
    failure_note = f"Samtools idxstats failed:\n{idxstats}\nStderr:{err_text}"
    self.assertFalse(err_text, failure_note)
def test_merge(self):
    """Merge bamfile1 and bamfile2 with samtools and check the output exists."""
    tolerated_warning = "[W::bam_merge_core2] No @HD tag found."
    merge = SamtoolsMergeCommandline(samtools_exe)
    merge.set_parameter("input_bam", [self.bamfile1, self.bamfile2])
    merge.set_parameter("out_bam", self.outbamfile)
    merge.set_parameter("f", True)  # Overwrite out.bam if it exists
    _, err_text = merge()
    # This worked up to v1.2, then there was a regression failing with a
    # message, but as of v1.3 a warning about the missing @HD tag is
    # expected.  So accept either an empty stderr or exactly that warning.
    stderr_ok = not err_text or err_text.strip() == tolerated_warning
    self.assertTrue(
        stderr_ok,
        f"Samtools merge failed:\n{merge}\nStderr:{err_text}",
    )
    self.assertTrue(os.path.exists(self.outbamfile))
def test_mpileup(self):
    """Run samtools mpileup on a single BAM file given as a list."""
    mpileup = SamtoolsMpileupCommandline(samtools_exe)
    bam_inputs = [self.bamfile1]
    mpileup.set_parameter("input_file", bam_inputs)
    out_text, _ = mpileup()
    # The "[bam_pileup_core]" marker must not show up in the captured stdout.
    self.assertNotIn("[bam_pileup_core]", out_text)
def test_mpileup_list(self):
    """Run samtools mpileup on a list of two sorted BAM files."""
    mpileup = SamtoolsMpileupCommandline(samtools_exe)
    bam_inputs = [self.sortedbamfile1, self.sortedbamfile2]
    mpileup.set_parameter("input_file", bam_inputs)
    out_text, _ = mpileup()
    # The "[bam_pileup_core]" marker must not show up in the captured stdout.
    self.assertNotIn("[bam_pileup_core]", out_text)
# TODO: def test_phase(self):
# TODO: def test_reheader(self):
# TODO: def test_rmdup(self):
# TODO: def test_targetcut(self):
if __name__ == "__main__":
    # When executed as a script, run the tests with a verbosity=2 text runner.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

View File

@ -126,11 +126,9 @@ PACKAGES = [
"Bio",
"Bio.Affy",
"Bio.Align",
"Bio.Align.Applications",
"Bio.Align.substitution_matrices",
"Bio.AlignIO",
"Bio.Alphabet",
"Bio.Application",
"Bio.Blast",
"Bio.CAPS",
"Bio.Cluster",
@ -154,7 +152,6 @@ PACKAGES = [
"Bio.KEGG.KGML",
"Bio.Medline",
"Bio.motifs",
"Bio.motifs.applications",
"Bio.motifs.jaspar",
"Bio.Nexus",
"Bio.NMR",
@ -176,12 +173,10 @@ PACKAGES = [
"Bio.SeqIO",
"Bio.SeqUtils",
"Bio.Sequencing",
"Bio.Sequencing.Applications",
"Bio.SVDSuperimposer",
"Bio.SwissProt",
"Bio.TogoWS",
"Bio.Phylo",
"Bio.Phylo.Applications",
"Bio.Phylo.PAML",
"Bio.UniGene",
"Bio.UniProt",