mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
$ ruff check --fix --select=I \ --config=lint.isort.force-single-line=true \ --config=lint.isort.order-by-type=false \ BioSQL/ Bio/ Tests/ Scripts/ Doc/ setup.py Using ruff version 0.4.10
1605 lines
66 KiB
Python
1605 lines
66 KiB
Python
# Copyright 2001 Brad Chapman.
|
|
# Revisions copyright 2009-2010 by Peter Cock.
|
|
# Revisions copyright 2010 by Phillip Garland.
|
|
# All rights reserved.
|
|
# This file is part of the Biopython distribution and governed by your
|
|
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
|
# Please see the LICENSE file that should have been included as part of this
|
|
# package.
|
|
"""Definitions for interacting with BLAST related applications (OBSOLETE).
|
|
|
|
Wrappers for the new NCBI BLAST+ tools (written in C++):
|
|
|
|
- NcbiblastpCommandline - Protein-Protein BLAST
|
|
- NcbiblastnCommandline - Nucleotide-Nucleotide BLAST
|
|
- NcbiblastxCommandline - Translated Query-Protein Subject BLAST
|
|
- NcbitblastnCommandline - Protein Query-Translated Subject BLAST
|
|
- NcbitblastxCommandline - Translated Query-Translated Subject BLAST
|
|
- NcbipsiblastCommandline - Position-Specific Initiated BLAST
|
|
- NcbirpsblastCommandline - Reverse Position Specific BLAST
|
|
- NcbirpstblastnCommandline - Translated Reverse Position Specific BLAST
|
|
- NcbideltablastCommandline - Protein-Protein domain enhanced lookup time accelerated blast
|
|
- NcbiblastformatterCommandline - Convert ASN.1 to other BLAST output formats
|
|
- NcbimakeblastdbCommandline - Application to create BLAST databases
|
|
|
|
For further details, see:
|
|
|
|
Camacho et al. BLAST+: architecture and applications
|
|
BMC Bioinformatics 2009, 10:421
|
|
https://doi.org/10.1186/1471-2105-10-421
|
|
|
|
We have decided to remove this module in future, and instead recommend
|
|
building your command and invoking it via the subprocess module directly.
|
|
"""
|
|
|
|
from Bio.Application import _Option
|
|
from Bio.Application import _Switch
|
|
from Bio.Application import AbstractCommandline
|
|
|
|
|
|
class _NcbibaseblastCommandline(AbstractCommandline):
    """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE).

    This is provided for subclassing, it deals with shared options
    common to all the BLAST tools (blastn, rpsblast, etc AND
    blast_formatter).
    """

    def __init__(self, cmd=None, **kwargs):
        """Register the shared options and initialize the command line.

        Arguments:
         - cmd - name of the executable; must not be None (the concrete
           subclasses supply a default).
         - kwargs - option values, passed on to AbstractCommandline.

        The options defined here are placed in front of any options the
        subclass has already stored in ``self.parameters``.
        """
        assert cmd is not None
        extra_parameters = [
            # Core:
            _Switch(
                ["-h", "h"], "Print USAGE and DESCRIPTION; ignore other arguments."
            ),
            _Switch(
                ["-help", "help"],
                "Print USAGE, DESCRIPTION and ARGUMENTS description; "
                "ignore other arguments.",
            ),
            _Switch(
                ["-version", "version"],
                "Print version number; ignore other arguments.",
            ),
            # Output configuration options
            _Option(
                ["-out", "out"],
                "Output file for alignment.",
                filename=True,
                equate=False,
            ),
            # Formatting options:
            _Option(
                ["-outfmt", "outfmt"],
                "Alignment view. Typically an integer 0-14 but for some "
                "formats can be named columns like '6 qseqid sseqid'. "
                "Use 5 for XML output (differs from classic BLAST which "
                "used 7 for XML).",
                filename=True,  # to ensure spaced inputs are quoted
                equate=False,
            ),
            # TODO - Document and test the column options
            _Switch(["-show_gis", "show_gis"], "Show NCBI GIs in deflines?"),
            _Option(
                ["-num_descriptions", "num_descriptions"],
                "Number of database sequences to show one-line descriptions for.\n\n"
                "Integer argument (at least zero). Default is 500. "
                "See also num_alignments.",
                equate=False,
            ),
            _Option(
                ["-num_alignments", "num_alignments"],
                # Help text previously referred to itself instead of the
                # companion num_descriptions option.
                "Number of database sequences to show alignments for.\n\n"
                "Integer argument (at least zero). Default is 200. "
                "See also num_descriptions.",
                equate=False,
            ),
            _Option(
                ["-line_length", "line_length"],
                "Line length for formatting alignments "
                "(integer, at least 1, default 60).\n\n"
                "Not applicable for outfmt > 4. Added in BLAST+ 2.2.30.",
                equate=False,
            ),
            _Switch(
                ["-html", "html"], "Produce HTML output? See also the outfmt option."
            ),
            # Miscellaneous options
            _Switch(
                ["-parse_deflines", "parse_deflines"],
                "Should the query and subject defline(s) be parsed?",
            ),
        ]
        try:
            # Insert extra parameters - at the start just in case there
            # are any arguments which must come last:
            self.parameters = extra_parameters + self.parameters
        except AttributeError:
            # Should we raise an error?  The subclass should have set this up!
            self.parameters = extra_parameters
        AbstractCommandline.__init__(self, cmd, **kwargs)

    def _validate_incompatibilities(self, incompatibles):
        """Validate parameters for incompatibilities (PRIVATE).

        Used by the _validate method.

        Arguments:
         - incompatibles - dictionary mapping an option name to a list of
           option names which must not be set at the same time.

        Raises ValueError naming the first conflicting pair found.
        """
        for a in incompatibles:
            if self._get_parameter(a):
                for b in incompatibles[a]:
                    if self._get_parameter(b):
                        raise ValueError(f"Options {a} and {b} are incompatible.")
|
|
|
|
|
|
class _NcbiblastCommandline(_NcbibaseblastCommandline):
    """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE).

    This is provided for subclassing, it deals with shared options
    common to all the BLAST tools (blastn, rpsblast, etc).
    """

    def __init__(self, cmd=None, **kwargs):
        """Register the shared search options and initialize the command line.

        Arguments:
         - cmd - name of the executable; must not be None.
         - kwargs - option values, passed on to the parent class.

        The options defined here are placed in front of any options the
        subclass has already stored in ``self.parameters``.
        """
        assert cmd is not None
        extra_parameters = [
            # Input query options:
            _Option(
                ["-query", "query"],
                "The sequence to search with.",
                filename=True,
                equate=False,
            ),  # Should this be required?
            _Option(
                ["-query_loc", "query_loc"],
                "Location on the query sequence (Format: start-stop).",
                equate=False,
            ),
            # General search options:
            _Option(["-db", "db"], "The database to BLAST against.", equate=False),
            _Option(["-evalue", "evalue"], "Expectation value cutoff.", equate=False),
            _Option(
                ["-word_size", "word_size"],
                "Word size for wordfinder algorithm.\n\nInteger. Minimum 2.",
                equate=False,
            ),
            # BLAST-2-Sequences options:
            # - see subclass
            # Formatting options:
            # - see baseclass
            # Query filtering options
            _Option(
                ["-soft_masking", "soft_masking"],
                "Apply filtering locations as soft masks (Boolean, Default = true).",
                equate=False,
            ),
            _Switch(
                ["-lcase_masking", "lcase_masking"],
                "Use lower case filtering in query and subject sequence(s)?",
            ),
            # Restrict search or results
            _Option(
                ["-gilist", "gilist"],
                "Restrict search of database to list of GI's.\n\n"
                "Incompatible with: negative_gilist, seqidlist, negative_seqidlist, "
                "remote, subject, subject_loc",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-negative_gilist", "negative_gilist"],
                "Restrict search of database to everything except the listed GIs.\n\n"
                "Incompatible with: gilist, seqidlist, remote, subject, subject_loc",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-seqidlist", "seqidlist"],
                "Restrict search of database to list of SeqID's.\n\n"
                "Incompatible with: gilist, negative_gilist, remote, subject, "
                "subject_loc",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-negative_seqidlist", "negative_seqidlist"],
                "Restrict search of database to everything except listed SeqID's.\n\n"
                "Incompatible with: gilist, seqidlist, remote, subject, subject_loc",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-entrez_query", "entrez_query"],
                "Restrict search with the given Entrez query (requires remote).",
                equate=False,
            ),
            _Option(
                ["-qcov_hsp_perc", "qcov_hsp_perc"],
                "Percent query coverage per hsp (float, 0 to 100).\n\n"
                "Added in BLAST+ 2.2.30.",
                equate=False,
            ),
            _Option(
                ["-max_target_seqs", "max_target_seqs"],
                "Maximum number of aligned sequences to keep (integer, at least one).",
                equate=False,
            ),
            # Statistical options
            _Option(
                ["-dbsize", "dbsize"],
                "Effective length of the database (integer).",
                equate=False,
            ),
            _Option(
                ["-searchsp", "searchsp"],
                "Effective length of the search space (integer).",
                equate=False,
            ),
            _Option(
                ["-max_hsps_per_subject", "max_hsps_per_subject"],
                "Override max number of HSPs per subject saved for ungapped searches "
                "(integer).",
                equate=False,
            ),
            _Option(
                ["-max_hsps", "max_hsps"],
                "Set max number of HSPs saved per subject sequence\n\n"
                "Default 0 means no limit.",
                equate=False,
            ),
            _Switch(["-sum_statistics", "sum_statistics"], "Use sum statistics."),
            # Is -sum_stats a BLAST+ bug, why not use -sum_statistics switch?
            _Option(
                ["-sum_stats", "sum_stats"],
                "Use sum statistics (boolean).\n\nAdded in BLAST+ 2.2.30.",
                equate=False,
            ),
            # Extension options
            _Option(
                ["-xdrop_ungap", "xdrop_ungap"],
                "X-dropoff value (in bits) for ungapped extensions (float).",
                equate=False,
            ),
            _Option(
                ["-xdrop_gap", "xdrop_gap"],
                "X-dropoff value (in bits) for preliminary gapped extensions (float).",
                equate=False,
            ),
            _Option(
                ["-xdrop_gap_final", "xdrop_gap_final"],
                "X-dropoff value (in bits) for final gapped alignment (float).",
                equate=False,
            ),
            _Option(
                ["-window_size", "window_size"],
                "Multiple hits window size, use 0 to specify 1-hit algorithm "
                "(integer).",
                equate=False,
            ),
            # Search strategy options
            _Option(
                ["-import_search_strategy", "import_search_strategy"],
                "Search strategy to use.\n\n"
                "Incompatible with: export_search_strategy",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-export_search_strategy", "export_search_strategy"],
                "File name to record the search strategy used.\n\n"
                "Incompatible with: import_search_strategy",
                filename=True,
                equate=False,
            ),
            # Miscellaneous options
            _Option(
                ["-num_threads", "num_threads"],
                "Number of threads to use in the BLAST search.\n\n"
                "Integer, at least one. Default is one. Incompatible with: remote",
                equate=False,
            ),
            _Switch(
                ["-remote", "remote"],
                "Execute search remotely?\n\n"
                "Incompatible with: gilist, negative_gilist, subject_loc, "
                "num_threads, ...",
            ),
        ]
        try:
            # Insert extra parameters - at the start just in case there
            # are any arguments which must come last:
            self.parameters = extra_parameters + self.parameters
        except AttributeError:
            # Should we raise an error?  The subclass should have set this up!
            self.parameters = extra_parameters
        _NcbibaseblastCommandline.__init__(self, cmd, **kwargs)

    def _validate(self):
        """Check the shared search options for conflicting combinations.

        Raises ValueError if two mutually exclusive options are both set,
        or if entrez_query is used without remote; then runs the
        AbstractCommandline validation.
        """
        incompatibles = {
            "remote": ["gilist", "negative_gilist", "num_threads"],
            "import_search_strategy": ["export_search_strategy"],
            "gilist": ["negative_gilist"],
            "seqidlist": ["gilist", "negative_gilist", "remote"],
        }
        self._validate_incompatibilities(incompatibles)
        if self.entrez_query and not self.remote:
            raise ValueError("Option entrez_query requires remote option.")
        AbstractCommandline._validate(self)
|
|
|
|
|
|
class _Ncbiblast2SeqCommandline(_NcbiblastCommandline):
    """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE).

    Intended for subclassing only. It holds the options shared by the
    BLAST tools which support two-sequence BLAST (blastn, psiblast, etc)
    but not rpsblast or rpstblastn.
    """

    def __init__(self, cmd=None, **kwargs):
        """Register the two-sequence BLAST options, then defer to the parent.

        The options defined here go in front of whatever the subclass has
        already stored in ``self.parameters``.
        """
        assert cmd is not None
        two_seq_options = [
            # General search options:
            _Option(
                ["-gapopen", "gapopen"],
                "Cost to open a gap (integer).",
                equate=False,
            ),
            _Option(
                ["-gapextend", "gapextend"],
                "Cost to extend a gap (integer).",
                equate=False,
            ),
            # BLAST-2-Sequences options:
            _Option(
                ["-subject", "subject"],
                "Subject sequence(s) to search.\n\n"
                "Incompatible with: db, gilist, seqidlist, negative_gilist, "
                "negative_seqidlist, db_soft_mask, db_hard_mask\n\n"
                "See also subject_loc.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-subject_loc", "subject_loc"],
                "Location on the subject sequence (Format: start-stop).\n\n"
                "Incompatible with: db, gilist, seqidlist, negative_gilist, "
                "negative_seqidlist, db_soft_mask, db_hard_mask, remote.\n\n"
                "See also subject.",
                equate=False,
            ),
            # Restrict search or results:
            _Option(
                ["-culling_limit", "culling_limit"],
                "Hit culling limit (integer).\n\n"
                "If the query range of a hit is enveloped by that of at "
                "least this many higher-scoring hits, delete the hit.\n\n"
                "Incompatible with: best_hit_overhang, best_hit_score_edge.",
                equate=False,
            ),
            _Option(
                ["-best_hit_overhang", "best_hit_overhang"],
                "Best Hit algorithm overhang value (float, recommended value: 0.1)\n\n"
                "Float between 0.0 and 0.5 inclusive. "
                "Incompatible with: culling_limit.",
                equate=False,
            ),
            _Option(
                ["-best_hit_score_edge", "best_hit_score_edge"],
                "Best Hit algorithm score edge value (float).\n\n"
                "Float between 0.0 and 0.5 inclusive. Recommended value: 0.1\n\n"
                "Incompatible with: culling_limit.",
                equate=False,
            ),
        ]
        # Put these options first, in case any of the subclass arguments
        # have to come last on the command line.
        try:
            self.parameters = two_seq_options + self.parameters
        except AttributeError:
            # The subclass was expected to have set self.parameters already.
            self.parameters = two_seq_options
        super().__init__(cmd, **kwargs)

    def _validate(self):
        """Reject conflicting two-sequence options, then validate the rest."""
        self._validate_incompatibilities(
            {
                "subject_loc": [
                    "db",
                    "gilist",
                    "negative_gilist",
                    "seqidlist",
                    "remote",
                ],
                "culling_limit": ["best_hit_overhang", "best_hit_score_edge"],
                "subject": ["db", "gilist", "negative_gilist", "seqidlist"],
            }
        )
        super()._validate()
|
|
|
|
|
|
class _NcbiblastMain2SeqCommandline(_Ncbiblast2SeqCommandline):
    """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE).

    Intended for subclassing only. It holds the options shared by the
    main BLAST tools blastp, blastn, blastx, tblastx, tblastn but not
    psiblast, rpsblast or rpstblastn.
    """

    def __init__(self, cmd=None, **kwargs):
        """Register the database masking options, then defer to the parent.

        The options defined here go in front of whatever the subclass has
        already stored in ``self.parameters``.
        """
        assert cmd is not None
        masking_options = [
            # Restrict search or results:
            _Option(
                ["-db_soft_mask", "db_soft_mask"],
                "Filtering algorithm for soft masking (integer).\n\n"
                "Filtering algorithm ID to apply to BLAST database as soft masking. "
                "Incompatible with: db_hard_mask, subject, subject_loc",
                equate=False,
            ),
            _Option(
                ["-db_hard_mask", "db_hard_mask"],
                "Filtering algorithm for hard masking (integer).\n\n"
                "Filtering algorithm ID to apply to BLAST database as hard masking. "
                "Incompatible with: db_soft_mask, subject, subject_loc",
                equate=False,
            ),
        ]
        # Put these options first, in case any of the subclass arguments
        # have to come last on the command line.
        try:
            self.parameters = masking_options + self.parameters
        except AttributeError:
            # The subclass was expected to have set self.parameters already.
            self.parameters = masking_options
        super().__init__(cmd, **kwargs)

    def _validate(self):
        """Reject conflicting masking options, then validate the rest."""
        # Neither mask can be combined with the other, nor with a subject.
        subject_options = ["subject", "subject_loc"]
        self._validate_incompatibilities(
            {
                "db_soft_mask": ["db_hard_mask", *subject_options],
                "db_hard_mask": ["db_soft_mask", *subject_options],
            }
        )
        super()._validate()
|
|
|
|
|
|
class NcbiblastpCommandline(_NcbiblastMain2SeqCommandline):
    """Create a commandline for the NCBI BLAST+ program blastp (for proteins).

    With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
    replaced the old blastall tool with separate tools for each of the searches.
    This wrapper therefore replaces BlastallCommandline with option -p blastp.

    >>> from Bio.Blast.Applications import NcbiblastpCommandline
    >>> cline = NcbiblastpCommandline(query="rosemary.pro", db="nr",
    ...                               evalue=0.001, remote=True, ungapped=True)
    >>> cline
    NcbiblastpCommandline(cmd='blastp', query='rosemary.pro', db='nr', evalue=0.001, remote=True, ungapped=True)
    >>> print(cline)
    blastp -query rosemary.pro -db nr -evalue 0.001 -remote -ungapped

    You would typically run the command line with cline() or via the Python
    subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="blastp", **kwargs):
        """Initialize the class."""
        self.parameters = [
            # General search options:
            _Option(
                ["-task", "task"],
                "Task to execute (string, blastp (default), blastp-fast or blastp-short).",
                checker_function=lambda value: value
                in ["blastp", "blastp-fast", "blastp-short"],
                equate=False,
            ),
            _Option(
                ["-matrix", "matrix"],
                "Scoring matrix name (default BLOSUM62).",
                # Space separated like every other option in this module,
                # rather than the "-matrix=NAME" form.
                equate=False,
            ),
            _Option(
                ["-threshold", "threshold"],
                "Minimum score for words to be added to the BLAST lookup table (float).",
                equate=False,
            ),
            _Option(
                ["-comp_based_stats", "comp_based_stats"],
                "Use composition-based statistics (string, default 2, i.e. True).\n\n"
                "0, F or f: no composition-based statistics\n\n"
                "2, T or t, D or d : Composition-based score adjustment as in "
                "Bioinformatics 21:902-911, 2005, conditioned on sequence "
                "properties\n\n"
                "Note that tblastn also supports values of 1 and 3.",
                # Compare against the individual allowed values; a substring
                # test would reject "f" and accept "" or "Ft".  str() lets
                # the integers 0 and 2 through as well.
                checker_function=lambda value: str(value)
                in ("0", "F", "f", "2", "T", "t", "D", "d"),
                equate=False,
            ),
            # Query filtering options:
            _Option(
                ["-seg", "seg"],
                "Filter query sequence with SEG (string).\n\n"
                'Format: "yes", "window locut hicut", or "no" to disable\n'
                'Default is "12 2.2 2.5"',
                equate=False,
            ),
            # Extension options:
            _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"),
            # Miscellaneous options:
            _Switch(
                ["-use_sw_tback", "use_sw_tback"],
                "Compute locally optimal Smith-Waterman alignments?",
            ),
        ]
        _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
|
|
|
|
|
|
class NcbiblastnCommandline(_NcbiblastMain2SeqCommandline):
    """Wrapper for the NCBI BLAST+ program blastn (for nucleotides).

    With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
    replaced the old blastall tool with separate tools for each of the searches.
    This wrapper therefore replaces BlastallCommandline with option -p blastn.

    For example, to run a search against the "nt" nucleotide database using the
    FASTA nucleotide file "m_cold.fasta" as the query, with an expectation value
    cut off of 0.001, saving the output to a file in XML format:

    >>> from Bio.Blast.Applications import NcbiblastnCommandline
    >>> cline = NcbiblastnCommandline(query="m_cold.fasta", db="nt", strand="plus",
    ...                               evalue=0.001, out="m_cold.xml", outfmt=5)
    >>> cline
    NcbiblastnCommandline(cmd='blastn', out='m_cold.xml', outfmt=5, query='m_cold.fasta', db='nt', evalue=0.001, strand='plus')
    >>> print(cline)
    blastn -out m_cold.xml -outfmt 5 -query m_cold.fasta -db nt -evalue 0.001 -strand plus

    You would typically run the command line with cline() or via the Python
    subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="blastn", **kwargs):
        """Initialize the class."""
        self.parameters = [
            # Input query options:
            _Option(
                ["-strand", "strand"],
                "Query strand(s) to search against database/subject.\n\n"
                'Values allowed are "both" (default), "minus", "plus".',
                checker_function=lambda value: value in ["both", "minus", "plus"],
                equate=False,
            ),
            # General search options:
            _Option(
                ["-task", "task"],
                "Task to execute (string, default 'megablast')\n\n"
                "Allowed values 'blastn', 'blastn-short', 'dc-megablast', 'megablast' "
                "(the default), or 'vecscreen'.",
                checker_function=lambda value: value
                in ["blastn", "blastn-short", "dc-megablast", "megablast", "vecscreen"],
                equate=False,
            ),
            _Option(
                ["-penalty", "penalty"],
                "Penalty for a nucleotide mismatch (integer, at most zero).",
                equate=False,
            ),
            _Option(
                ["-reward", "reward"],
                "Reward for a nucleotide match (integer, at least zero).",
                equate=False,
            ),
            _Option(
                ["-use_index", "use_index"],
                "Use MegaBLAST database index (Boolean, Default = False)",
                equate=False,
            ),
            _Option(
                ["-index_name", "index_name"],
                "MegaBLAST database index name.",
                equate=False,
            ),
            # Query filtering options:
            _Option(
                ["-dust", "dust"],
                "Filter query sequence with DUST (string).\n\n"
                "Format: 'yes', 'level window linker', or 'no' to disable.\n\n"
                "Default = '20 64 1'.",
                equate=False,
            ),
            _Option(
                ["-filtering_db", "filtering_db"],
                "BLAST database containing filtering elements (i.e. repeats).",
                equate=False,
            ),
            _Option(
                ["-window_masker_taxid", "window_masker_taxid"],
                "Enable WindowMasker filtering using a Taxonomic ID (integer).",
                equate=False,
            ),
            _Option(
                ["-window_masker_db", "window_masker_db"],
                "Enable WindowMasker filtering using this repeats database (string).",
                equate=False,
            ),
            # Restrict search or results:
            _Option(
                ["-perc_identity", "perc_identity"],
                "Percent identity (real, 0 to 100 inclusive).",
                equate=False,
            ),
            # Discontiguous MegaBLAST options
            _Option(
                ["-template_type", "template_type"],
                "Discontiguous MegaBLAST template type (string).\n\n"
                "Allowed values: 'coding', 'coding_and_optimal' or 'optimal'.\n"
                "Requires: template_length.",
                checker_function=lambda value: value
                in ["coding", "coding_and_optimal", "optimal"],
                equate=False,
            ),
            _Option(
                ["-template_length", "template_length"],
                "Discontiguous MegaBLAST template length (integer).\n\n"
                "Allowed values: 16, 18, 21.\n\n"
                "Requires: template_type.",
                checker_function=lambda value: value in [16, 18, 21, "16", "18", "21"],
                equate=False,
            ),
            # Extension options:
            _Switch(
                ["-no_greedy", "no_greedy"],
                "Use non-greedy dynamic programming extension",
            ),
            _Option(
                ["-min_raw_gapped_score", "min_raw_gapped_score"],
                "Minimum raw gapped score to keep an alignment in the "
                "preliminary gapped and traceback stages (integer).",
                equate=False,
            ),
            _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"),
            _Option(
                ["-off_diagonal_range", "off_diagonal_range"],
                "Number of off-diagonals to search for the 2nd hit (integer).\n\n"
                # The separator after "off." was missing, running the two
                # sentences together in the generated help.
                "Expects a positive integer, or 0 (default) to turn off.\n\n"
                "Added in BLAST 2.2.23+",
                equate=False,
            ),
        ]
        _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)

    def _validate(self):
        """Check that template_type and template_length are used together.

        Raises ValueError if only one of the two options is set, then
        runs the parent class validation.
        """
        if (self.template_type and not self.template_length) or (
            self.template_length and not self.template_type
        ):
            raise ValueError(
                "Options template_type and template_length require each other."
            )
        _NcbiblastMain2SeqCommandline._validate(self)
|
|
|
|
|
|
class NcbiblastxCommandline(_NcbiblastMain2SeqCommandline):
    """Wrapper for the NCBI BLAST+ program blastx (nucleotide query, protein database).

    With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
    replaced the old blastall tool with separate tools for each of the searches.
    This wrapper therefore replaces BlastallCommandline with option -p blastx.

    >>> from Bio.Blast.Applications import NcbiblastxCommandline
    >>> cline = NcbiblastxCommandline(query="m_cold.fasta", db="nr", evalue=0.001)
    >>> cline
    NcbiblastxCommandline(cmd='blastx', query='m_cold.fasta', db='nr', evalue=0.001)
    >>> print(cline)
    blastx -query m_cold.fasta -db nr -evalue 0.001

    You would typically run the command line with cline() or via the Python
    subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="blastx", **kwargs):
        """Initialize the class."""
        self.parameters = [
            # General search options:
            _Option(
                ["-task", "task"],
                "Task to execute (string, blastx (default) or blastx-fast).",
                checker_function=lambda value: value in ["blastx", "blastx-fast"],
                equate=False,
            ),
            # Input query options:
            _Option(
                ["-strand", "strand"],
                "Query strand(s) to search against database/subject.\n\n"
                'Values allowed are "both" (default), "minus", "plus".',
                checker_function=lambda value: value in ["both", "minus", "plus"],
                equate=False,
            ),
            _Option(
                ["-query_gencode", "query_gencode"],
                "Genetic code to use to translate query (integer, default 1).",
                equate=False,
            ),
            # General search options:
            _Option(
                ["-frame_shift_penalty", "frame_shift_penalty"],
                "Frame shift penalty (integer, at least 1, default ignored) (OBSOLETE).\n\n"
                "This was removed in BLAST 2.2.27+",
                equate=False,
            ),
            _Option(
                ["-max_intron_length", "max_intron_length"],
                "Maximum intron length (integer).\n\n"
                "Length of the largest intron allowed in a translated nucleotide "
                "sequence when linking multiple distinct alignments (a negative "
                "value disables linking). Default zero.",
                equate=False,
            ),
            _Option(
                ["-matrix", "matrix"],
                "Scoring matrix name (default BLOSUM62).",
                equate=False,
            ),
            _Option(
                ["-threshold", "threshold"],
                "Minimum score for words to be added to the BLAST lookup table (float).",
                equate=False,
            ),
            _Option(
                ["-comp_based_stats", "comp_based_stats"],
                "Use composition-based statistics for blastp, blastx, or tblastn.\n\n"
                "D or d: default (equivalent to 2 )\n\n"
                "0 or F or f: no composition-based statistics\n\n"
                "1: Composition-based statistics as in NAR 29:2994-3005, 2001\n\n"
                "2 or T or t : Composition-based score adjustment as in "
                "Bioinformatics 21:902-911, 2005, conditioned on sequence "
                "properties\n\n"
                "3: Composition-based score adjustment as in Bioinformatics "
                "21:902-911, 2005, unconditionally.\n\n"
                "For programs other than tblastn, must either be absent or be "
                "D, F or 0\n\n"
                "Default = 2.",
                equate=False,
            ),
            # Query filtering options:
            _Option(
                ["-seg", "seg"],
                "Filter query sequence with SEG (string).\n\n"
                # The separator after "disable." was missing, running the
                # two sentences together in the generated help.
                'Format: "yes", "window locut hicut", or "no" to disable.\n\n'
                'Default is "12 2.2 2.5"',
                equate=False,
            ),
            # Extension options:
            _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"),
            # Miscellaneous options:
            _Switch(
                ["-use_sw_tback", "use_sw_tback"],
                "Compute locally optimal Smith-Waterman alignments?",
            ),
        ]
        _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
|
|
|
|
|
|
class NcbitblastnCommandline(_NcbiblastMain2SeqCommandline):
    """Wrapper for the NCBI BLAST+ program tblastn.

    With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
    replaced the old blastall tool with separate tools for each of the searches.
    This wrapper therefore replaces BlastallCommandline with option -p tblastn.

    >>> from Bio.Blast.Applications import NcbitblastnCommandline
    >>> cline = NcbitblastnCommandline(help=True)
    >>> cline
    NcbitblastnCommandline(cmd='tblastn', help=True)
    >>> print(cline)
    tblastn -help

    You would typically run the command line with cline() or via the Python
    subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="tblastn", **kwargs):
        """Initialize the class."""
        self.parameters = [
            # General search options:
            _Option(
                ["-task", "task"],
                "Task to execute (string, tblastn (default) or tblastn-fast).",
                checker_function=lambda value: value in ["tblastn", "tblastn-fast"],
                equate=False,
            ),
            _Option(
                ["-db_gencode", "db_gencode"],
                # This option concerns the database/subject side; the query
                # is a protein here.
                "Genetic code to use to translate database/subjects "
                "(integer, default 1).",
                equate=False,
            ),
            _Option(
                ["-frame_shift_penalty", "frame_shift_penalty"],
                "Frame shift penalty (integer, at least 1, default ignored) (OBSOLETE).\n\n"
                "This was removed in BLAST 2.2.27+",
                equate=False,
            ),
            _Option(
                ["-max_intron_length", "max_intron_length"],
                "Maximum intron length (integer).\n\n"
                "Length of the largest intron allowed in a translated nucleotide "
                "sequence when linking multiple distinct alignments (a negative "
                "value disables linking). Default zero.",
                equate=False,
            ),
            _Option(
                ["-matrix", "matrix"],
                "Scoring matrix name (default BLOSUM62).",
                equate=False,
            ),
            _Option(
                ["-threshold", "threshold"],
                "Minimum score for words to be added to the BLAST lookup table (float).",
                equate=False,
            ),
            _Option(
                ["-comp_based_stats", "comp_based_stats"],
                "Use composition-based statistics (string, default 2, i.e. True).\n\n"
                "0, F or f: no composition-based statistics\n\n"
                "1: Composition-based statistics as in NAR 29:2994-3005, 2001\n\n"
                "2, T or t, D or d : Composition-based score adjustment as in "
                "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n"
                "3: Composition-based score adjustment as in Bioinformatics 21:902-911, "
                "2005, unconditionally\n\n"
                "Note that only tblastn supports values of 1 and 3.",
                # Compare against the individual allowed values; a substring
                # test would reject "f" and accept "" or "12".  str() lets
                # the integers 0 to 3 through as well.
                checker_function=lambda value: str(value)
                in ("0", "F", "f", "1", "2", "T", "t", "D", "d", "3"),
                equate=False,
            ),
            # Query filtering options:
            _Option(
                ["-seg", "seg"],
                "Filter query sequence with SEG (string).\n\n"
                'Format: "yes", "window locut hicut", or "no" to disable.\n\n'
                'Default is "12 2.2 2.5"',
                equate=False,
            ),
            # Extension options:
            _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"),
            # Miscellaneous options:
            _Switch(
                ["-use_sw_tback", "use_sw_tback"],
                "Compute locally optimal Smith-Waterman alignments?",
            ),
            # PSI-TBLASTN options:
            _Option(
                ["-in_pssm", "in_pssm"],
                "PSI-BLAST checkpoint file.\n\nIncompatible with: remote, query",
                filename=True,
                equate=False,
            ),
        ]
        _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
|
|
|
|
|
|
class NcbitblastxCommandline(_NcbiblastMain2SeqCommandline):
    """Wrapper for the NCBI BLAST+ program tblastx.

    With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
    replaced the old blastall tool with separate tools for each of the searches.
    This wrapper therefore replaces BlastallCommandline with option -p tblastx.

    >>> from Bio.Blast.Applications import NcbitblastxCommandline
    >>> cline = NcbitblastxCommandline(help=True)
    >>> cline
    NcbitblastxCommandline(cmd='tblastx', help=True)
    >>> print(cline)
    tblastx -help

    You would typically run the command line with cline() or via the Python
    subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="tblastx", **kwargs):
        """Initialize the class."""
        self.parameters = [
            # Input query options:
            _Option(
                ["-strand", "strand"],
                "Query strand(s) to search against database/subject.\n\n"
                'Values allowed are "both" (default), "minus", "plus".',
                checker_function=lambda value: value in ["both", "minus", "plus"],
                equate=False,
            ),
            _Option(
                ["-query_gencode", "query_gencode"],
                "Genetic code to use to translate query (integer, default 1).",
                equate=False,
            ),
            # General search options:
            _Option(
                ["-db_gencode", "db_gencode"],
                # Distinct from query_gencode above: this one concerns the
                # database/subject side.
                "Genetic code to use to translate database/subjects "
                "(integer, default 1).",
                equate=False,
            ),
            _Option(
                ["-max_intron_length", "max_intron_length"],
                "Maximum intron length (integer).\n\n"
                "Length of the largest intron allowed in a translated nucleotide "
                "sequence when linking multiple distinct alignments (a negative "
                "value disables linking). Default zero.",
                equate=False,
            ),
            _Option(
                ["-matrix", "matrix"],
                "Scoring matrix name (default BLOSUM62).",
                equate=False,
            ),
            _Option(
                ["-threshold", "threshold"],
                "Minimum score for words to be added to the BLAST lookup table (float).",
                equate=False,
            ),
            # Query filtering options:
            _Option(
                ["-seg", "seg"],
                "Filter query sequence with SEG (string).\n\n"
                'Format: "yes", "window locut hicut", or "no" to disable.\n\n'
                'Default is "12 2.2 2.5"',
                equate=False,
            ),
        ]
        _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
|
|
|
|
|
|
class NcbipsiblastCommandline(_Ncbiblast2SeqCommandline):
    """Wrapper for the NCBI BLAST+ program psiblast.

    With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
    replaced the old blastpgp tool with a similar tool psiblast. This wrapper
    therefore replaces BlastpgpCommandline, the wrapper for blastpgp.

    >>> from Bio.Blast.Applications import NcbipsiblastCommandline
    >>> cline = NcbipsiblastCommandline(help=True)
    >>> cline
    NcbipsiblastCommandline(cmd='psiblast', help=True)
    >>> print(cline)
    psiblast -help

    You would typically run the command line with cline() or via the Python
    subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="psiblast", **kwargs):
        """Initialize the class.

        Arguments:
         - cmd - program name placed at the start of the command line.
         - kwargs - option values, forwarded to the parent initializer.

        """
        # Exact values documented for -comp_based_stats. A tuple is used
        # rather than substring matching against a string so that the
        # documented "f" is accepted, while "" and multi-character fragments
        # are rejected. Kept as a local because instance attribute assignment
        # is reserved for command line options.
        valid_comp_based_stats = ("0", "F", "f", "2", "T", "t", "D", "d")
        self.parameters = [
            # General search options:
            _Option(
                ["-matrix", "matrix"],
                "Scoring matrix name (default BLOSUM62).",
                equate=False,
            ),
            _Option(
                ["-threshold", "threshold"],
                "Minimum score for words to be added to the BLAST lookup table (float).",
                equate=False,
            ),
            _Option(
                ["-comp_based_stats", "comp_based_stats"],
                "Use composition-based statistics (string, default 2, i.e. True).\n\n"
                "0, F or f: no composition-based statistics\n\n"
                "2, T or t, D or d : Composition-based score adjustment as in "
                "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n"
                "Note that tblastn also supports values of 1 and 3.",
                # str() lets the integers 0 and 2 through as well; they are
                # rendered with str() on the command line anyway.
                checker_function=lambda value: str(value) in valid_comp_based_stats,
                equate=False,
            ),
            # Query filtering options:
            _Option(
                ["-seg", "seg"],
                "Filter query sequence with SEG (string).\n\n"
                'Format: "yes", "window locut hicut", or "no" to disable. '
                'Default is "12 2.2 2.5"',
                equate=False,
            ),
            # Extension options:
            _Option(
                ["-gap_trigger", "gap_trigger"],
                "Number of bits to trigger gapping (float, default 22).",
                equate=False,
            ),
            # Miscellaneous options:
            _Switch(
                ["-use_sw_tback", "use_sw_tback"],
                "Compute locally optimal Smith-Waterman alignments?",
            ),
            # PSI-BLAST options:
            _Option(
                ["-num_iterations", "num_iterations"],
                "Number of iterations to perform (integer, at least one).\n\n"
                "Default is one. Incompatible with: remote",
                equate=False,
            ),
            _Option(
                ["-out_pssm", "out_pssm"],
                "File name to store checkpoint file.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-out_ascii_pssm", "out_ascii_pssm"],
                "File name to store ASCII version of PSSM.",
                filename=True,
                equate=False,
            ),
            _Switch(
                ["-save_pssm_after_last_round", "save_pssm_after_last_round"],
                "Save PSSM after the last database search.",
            ),
            _Switch(
                ["-save_each_pssm", "save_each_pssm"],
                "Save PSSM after each iteration\n\n"
                "File name is given in -save_pssm or -save_ascii_pssm options.",
            ),
            _Option(
                ["-in_msa", "in_msa"],
                "File name of multiple sequence alignment to restart PSI-BLAST.\n\n"
                "Incompatible with: in_pssm, query",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-msa_master_idx", "msa_master_idx"],
                "Index of sequence to use as master in MSA.\n\n"
                "Index (1-based) of sequence to use as the master in the multiple "
                "sequence alignment. If not specified, the first sequence is used.",
                equate=False,
            ),
            _Option(
                ["-in_pssm", "in_pssm"],
                "PSI-BLAST checkpoint file.\n\n"
                "Incompatible with: in_msa, query, phi_pattern",
                filename=True,
                equate=False,
            ),
            # PSSM engine options:
            _Option(
                ["-pseudocount", "pseudocount"],
                "Pseudo-count value used when constructing PSSM.\n\n"
                "Integer. Default is zero.",
                equate=False,
            ),
            _Option(
                ["-inclusion_ethresh", "inclusion_ethresh"],
                "E-value inclusion threshold for pairwise alignments (float, default 0.002).",
                equate=False,
            ),
            _Switch(
                ["-ignore_msa_master", "ignore_msa_master"],
                "Ignore the master sequence when creating PSSM.\n\n"
                "Requires: in_msa\n"
                "Incompatible with: msa_master_idx, in_pssm, query, query_loc, "
                "phi_pattern",
            ),
            # PHI-BLAST options:
            _Option(
                ["-phi_pattern", "phi_pattern"],
                "File name containing pattern to search.\n\n"
                "Incompatible with: in_pssm",
                filename=True,
                equate=False,
            ),
        ]
        _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs)

    def _validate(self):
        """Check psiblast option combinations, then defer to the parent (PRIVATE).

        Each key maps an option to the options it cannot be combined with,
        mirroring the "Incompatible with" notes in the option help texts.
        """
        incompatibles = {
            "num_iterations": ["remote"],
            "in_msa": ["in_pssm", "query"],
            "in_pssm": ["in_msa", "query", "phi_pattern"],
            "ignore_msa_master": [
                "msa_master_idx",
                "in_pssm",
                "query",
                "query_loc",
                "phi_pattern",
            ],
        }
        self._validate_incompatibilities(incompatibles)
        _Ncbiblast2SeqCommandline._validate(self)
|
|
|
|
|
|
class NcbirpsblastCommandline(_NcbiblastCommandline):
    """Wrapper for the NCBI BLAST+ program rpsblast.

    With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
    replaced the old rpsblast tool with a similar tool of the same name. This
    wrapper replaces RpsBlastCommandline, the wrapper for the old rpsblast.

    >>> from Bio.Blast.Applications import NcbirpsblastCommandline
    >>> cline = NcbirpsblastCommandline(help=True)
    >>> cline
    NcbirpsblastCommandline(cmd='rpsblast', help=True)
    >>> print(cline)
    rpsblast -help

    You would typically run the command line with cline() or via the Python
    subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="rpsblast", **kwargs):
        """Initialize the class.

        Arguments:
         - cmd - program name placed at the start of the command line.
         - kwargs - option values, forwarded to the parent initializer.

        """
        # TODO - remove the -word_size argument as per BLAST+ 2.2.30
        # (BLAST team say it should never have been included, since
        # the word size is set when building the domain database.)
        # This likely means reviewing the class hierarchy again.

        # Exact values documented for -comp_based_stats. Membership in a
        # tuple (rather than a substring test against a string) rejects ""
        # and multi-character fragments such as "Ff". Kept as a local because
        # instance attribute assignment is reserved for command line options.
        valid_comp_based_stats = ("D", "d", "0", "F", "f", "1", "T", "t")
        self.parameters = [
            # Query filtering options:
            _Option(
                ["-seg", "seg"],
                "Filter query sequence with SEG (string).\n\n"
                # Trailing space keeps the two sentences apart once joined.
                'Format: "yes", "window locut hicut", or "no" to disable. '
                'Default is "12 2.2 2.5"',
                equate=False,
            ),
            # Restrict search or results:
            _Option(
                ["-culling_limit", "culling_limit"],
                "Hit culling limit (integer).\n\n"
                "If the query range of a hit is enveloped by that of at "
                "least this many higher-scoring hits, delete the hit. "
                "Incompatible with: best_hit_overhang, best_hit_score_edge.",
                equate=False,
            ),
            _Option(
                ["-best_hit_overhang", "best_hit_overhang"],
                "Best Hit algorithm overhang value (recommended value: 0.1).\n\n"
                "Float between 0.0 and 0.5 inclusive. "
                "Incompatible with: culling_limit.",
                equate=False,
            ),
            _Option(
                ["-best_hit_score_edge", "best_hit_score_edge"],
                "Best Hit algorithm score edge value (recommended value: 0.1).\n\n"
                "Float between 0.0 and 0.5 inclusive. "
                "Incompatible with: culling_limit.",
                equate=False,
            ),
            # General search options:
            _Option(
                ["-comp_based_stats", "comp_based_stats"],
                "Use composition-based statistics.\n\n"
                "D or d: default (equivalent to 0)\n\n"
                "0 or F or f: Simplified Composition-based statistics as in "
                "Bioinformatics 15:1000-1011, 1999\n\n"
                "1 or T or t: Composition-based statistics as in NAR 29:2994-3005, "
                "2001\n\n"
                "Default = 0.",
                # str() lets the integers 0 and 1 through as well; they are
                # rendered with str() on the command line anyway.
                checker_function=lambda value: str(value) in valid_comp_based_stats,
                equate=False,
            ),
            # Misc options:
            _Switch(
                ["-use_sw_tback", "use_sw_tback"],
                "Compute locally optimal Smith-Waterman alignments?",
            ),
        ]
        _NcbiblastCommandline.__init__(self, cmd, **kwargs)

    def _validate(self):
        """Check rpsblast option combinations, then defer to the parent (PRIVATE).

        culling_limit cannot be combined with either best-hit option, as
        stated in the option help texts.
        """
        incompatibles = {"culling_limit": ["best_hit_overhang", "best_hit_score_edge"]}
        self._validate_incompatibilities(incompatibles)
        _NcbiblastCommandline._validate(self)
|
|
|
|
|
|
class NcbirpstblastnCommandline(_NcbiblastCommandline):
    """Wrapper for the NCBI BLAST+ program rpstblastn.

    With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
    replaced the old rpsblast tool with a similar tool of the same name, and a
    separate tool rpstblastn for Translated Reverse Position Specific BLAST.

    >>> from Bio.Blast.Applications import NcbirpstblastnCommandline
    >>> cline = NcbirpstblastnCommandline(help=True)
    >>> cline
    NcbirpstblastnCommandline(cmd='rpstblastn', help=True)
    >>> print(cline)
    rpstblastn -help

    You would typically run the command line with cline() or via the Python
    subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="rpstblastn", **kwargs):
        """Initialize the class.

        Arguments:
         - cmd - program name placed at the start of the command line.
         - kwargs - option values, forwarded to the parent initializer.

        """
        # TODO - remove the -word_size argument as per BLAST+ 2.2.30
        # (BLAST team say it should never have been included, since
        # the word size is set when building the domain database.)
        # This likely means reviewing the class hierarchy again.

        # Exact values documented for -comp_based_stats. Membership in a
        # tuple (rather than a substring test against a string) rejects ""
        # and multi-character fragments such as "Ff". Kept as a local because
        # instance attribute assignment is reserved for command line options.
        valid_comp_based_stats = ("D", "d", "0", "F", "f", "1", "T", "t")
        self.parameters = [
            # Input query options:
            _Option(
                ["-strand", "strand"],
                "Query strand(s) to search against database/subject.\n\n"
                'Values allowed are "both" (default), "minus", "plus".',
                checker_function=lambda value: value in ["both", "minus", "plus"],
                equate=False,
            ),
            _Option(
                ["-query_gencode", "query_gencode"],
                "Genetic code to use to translate query (integer, default 1).",
                equate=False,
            ),
            # Query filtering options:
            _Option(
                ["-seg", "seg"],
                "Filter query sequence with SEG (string).\n\n"
                'Format: "yes", "window locut hicut", or "no" to disable. '
                'Default is "12 2.2 2.5"',
                equate=False,
            ),
            # General search options:
            _Option(
                ["-comp_based_stats", "comp_based_stats"],
                "Use composition-based statistics.\n\n"
                "D or d: default (equivalent to 0)\n\n"
                "0 or F or f: Simplified Composition-based statistics as in "
                "Bioinformatics 15:1000-1011, 1999\n\n"
                "1 or T or t: Composition-based statistics as in NAR 29:2994-3005, "
                "2001\n\n"
                "Default = 0.",
                # str() lets the integers 0 and 1 through as well; they are
                # rendered with str() on the command line anyway.
                checker_function=lambda value: str(value) in valid_comp_based_stats,
                equate=False,
            ),
            # Extension options:
            _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"),
            # Miscellaneous options:
            _Switch(
                ["-use_sw_tback", "use_sw_tback"],
                "Compute locally optimal Smith-Waterman alignments?",
            ),
        ]
        _NcbiblastCommandline.__init__(self, cmd, **kwargs)
|
|
|
|
|
|
class NcbiblastformatterCommandline(_NcbibaseblastCommandline):
    """Wrapper for the NCBI BLAST+ program blast_formatter.

    With the release of BLAST 2.2.24+ (i.e. the BLAST suite rewritten in C++
    instead of C), the NCBI added the ASN.1 output format option to all the
    search tools, and extended the blast_formatter to support this as input.

    The blast_formatter command allows you to convert the ASN.1 output into
    the other output formats (XML, tabular, plain text, HTML).

    >>> from Bio.Blast.Applications import NcbiblastformatterCommandline
    >>> cline = NcbiblastformatterCommandline(archive="example.asn", outfmt=5, out="example.xml")
    >>> cline
    NcbiblastformatterCommandline(cmd='blast_formatter', out='example.xml', outfmt=5, archive='example.asn')
    >>> print(cline)
    blast_formatter -out example.xml -outfmt 5 -archive example.asn

    You would typically run the command line with cline() or via the Python
    subprocess module, as described in the Biopython tutorial.

    Note that this wrapper is for the version of blast_formatter from BLAST
    2.2.24+ (or later) which is when the NCBI first announced the inclusion
    of this tool. There was actually an early version in BLAST 2.2.23+ (and
    possibly in older releases) but this did not have the -archive option
    (instead -rid is a mandatory argument), and is not supported by this
    wrapper.
    """

    def __init__(self, cmd="blast_formatter", **kwargs):
        """Initialize the class.

        Arguments:
         - cmd - program name placed at the start of the command line.
         - kwargs - option values, forwarded to the parent initializer.

        """
        # Where the results to be reformatted come from.
        input_options = [
            _Option(
                ["-rid", "rid"],
                "BLAST Request ID (RID), not compatible with archive arg.",
                equate=False,
            ),
            _Option(
                ["-archive", "archive"],
                "Archive file of results, not compatible with rid arg.",
                filename=True,
                equate=False,
            ),
        ]
        # Limits on what is reported.
        restriction_options = [
            _Option(
                ["-max_target_seqs", "max_target_seqs"],
                "Maximum number of aligned sequences to keep.",
                checker_function=lambda count: count >= 1,
                equate=False,
            ),
        ]
        # Must be in place before the parent initializer runs.
        self.parameters = input_options + restriction_options
        super().__init__(cmd, **kwargs)

    def _validate(self):
        """Reject rid combined with archive, then defer to the parent (PRIVATE)."""
        self._validate_incompatibilities({"rid": ["archive"]})
        super()._validate()
|
|
|
|
|
|
class NcbideltablastCommandline(_Ncbiblast2SeqCommandline):
    """Create a commandline for the NCBI BLAST+ program deltablast (for proteins).

    This is a wrapper for the deltablast command line command included in
    the NCBI BLAST+ software (not present in the original BLAST).

    >>> from Bio.Blast.Applications import NcbideltablastCommandline
    >>> cline = NcbideltablastCommandline(query="rosemary.pro", db="nr",
    ...                               evalue=0.001, remote=True)
    >>> cline
    NcbideltablastCommandline(cmd='deltablast', query='rosemary.pro', db='nr', evalue=0.001, remote=True)
    >>> print(cline)
    deltablast -query rosemary.pro -db nr -evalue 0.001 -remote

    You would typically run the command line with cline() or via the Python
    subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="deltablast", **kwargs):
        """Initialize the class.

        Arguments:
         - cmd - program name placed at the start of the command line.
         - kwargs - option values, forwarded to the parent initializer.

        """
        # Exact values documented for -comp_based_stats. A tuple is used
        # rather than substring matching against a string so that the
        # documented "f" is accepted, while "" and multi-character fragments
        # are rejected. Kept as a local because instance attribute assignment
        # is reserved for command line options.
        valid_comp_based_stats = ("0", "F", "f", "2", "T", "t", "D", "d")
        self.parameters = [
            # General search options:
            _Option(
                ["-matrix", "matrix"],
                "Scoring matrix name (default BLOSUM62).",
                # Space separated like every other option here, i.e.
                # "-matrix NAME" rather than "-matrix=NAME".
                equate=False,
            ),
            _Option(
                ["-threshold", "threshold"],
                "Minimum score for words to be added to the BLAST lookup table (float).",
                equate=False,
            ),
            _Option(
                ["-comp_based_stats", "comp_based_stats"],
                "Use composition-based statistics (string, default 2, i.e. True).\n\n"
                "0, F or f: no composition-based statistics.\n\n"
                "2, T or t, D or d : Composition-based score adjustment as in "
                "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n"
                "Note that tblastn also supports values of 1 and 3.",
                # str() lets the integers 0 and 2 through as well; they are
                # rendered with str() on the command line anyway.
                checker_function=lambda value: str(value) in valid_comp_based_stats,
                equate=False,
            ),
            # Query filtering options:
            _Option(
                ["-seg", "seg"],
                "Filter query sequence with SEG (string).\n\n"
                'Format: "yes", "window locut hicut", or "no" to disable. '
                'Default is "12 2.2 2.5"',
                equate=False,
            ),
            # Extension options:
            _Option(
                ["-gap_trigger", "gap_trigger"],
                "Number of bits to trigger gapping. Default = 22.",
                equate=False,
            ),
            # Miscellaneous options:
            _Switch(
                ["-use_sw_tback", "use_sw_tback"],
                "Compute locally optimal Smith-Waterman alignments?",
            ),
            # PSI-BLAST options
            _Option(
                ["-num_iterations", "num_iterations"],
                "Number of iterations to perform. (integer >=1, Default is 1).\n\n"
                "Incompatible with: remote",
                equate=False,
            ),
            _Option(
                ["-out_pssm", "out_pssm"],
                "File name to store checkpoint file.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-out_ascii_pssm", "out_ascii_pssm"],
                "File name to store ASCII version of PSSM.",
                filename=True,
                equate=False,
            ),
            _Switch(
                ["-save_pssm_after_last_round", "save_pssm_after_last_round"],
                "Save PSSM after the last database search.",
            ),
            _Switch(
                ["-save_each_pssm", "save_each_pssm"],
                "Save PSSM after each iteration.\n\n"
                "File name is given in -save_pssm or -save_ascii_pssm options.",
            ),
            # PSSM engine options
            _Option(
                ["-pseudocount", "pseudocount"],
                "Pseudo-count value used when constructing PSSM (integer, default 0).",
                equate=False,
            ),
            _Option(
                ["-domain_inclusion_ethresh", "domain_inclusion_ethresh"],
                "E-value inclusion threshold for alignments with conserved domains.\n\n"
                "(float, Default is 0.05)",
                equate=False,
            ),
            _Option(
                ["-inclusion_ethresh", "inclusion_ethresh"],
                "Pairwise alignment e-value inclusion threshold (float, default 0.002).",
                equate=False,
            ),
            # DELTA-BLAST options
            _Option(
                ["-rpsdb", "rpsdb"],
                "BLAST domain database name (string, Default = 'cdd_delta').",
                equate=False,
            ),
            _Switch(
                ["-show_domain_hits", "show_domain_hits"],
                "Show domain hits?\n\nIncompatible with: remote, subject",
            ),
        ]
        _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs)
|
|
|
|
|
|
class NcbimakeblastdbCommandline(AbstractCommandline):
    """Wrapper for the NCBI BLAST+ program makeblastdb.

    This is a wrapper for the NCBI BLAST+ makeblastdb application
    to create BLAST databases. By default, this creates a blast database
    with the same name as the input file. The default output location
    is the same directory as the input.

    >>> from Bio.Blast.Applications import NcbimakeblastdbCommandline
    >>> cline = NcbimakeblastdbCommandline(dbtype="prot",
    ...                                    input_file="NC_005816.faa")
    >>> cline
    NcbimakeblastdbCommandline(cmd='makeblastdb', dbtype='prot', input_file='NC_005816.faa')
    >>> print(cline)
    makeblastdb -dbtype prot -in NC_005816.faa

    You would typically run the command line with cline() or via the Python
    subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="makeblastdb", **kwargs):
        """Initialize the class.

        Arguments:
         - cmd - program name placed at the start of the command line.
         - kwargs - option values, forwarded to the parent initializer.

        """
        self.parameters = [
            # Basic input options
            _Switch(
                ["-h", "h"], "Print USAGE and DESCRIPTION; ignore other arguments."
            ),
            _Switch(
                ["-help", "help"],
                "Print USAGE, DESCRIPTION and ARGUMENTS description; "
                "ignore other arguments.",
            ),
            _Switch(
                ["-version", "version"],
                "Print version number; ignore other arguments.",
            ),
            # Output configuration options
            _Option(
                ["-out", "out"],
                # makeblastdb writes a database, not an alignment.
                "Name of BLAST database to be created.",
                filename=True,
                equate=False,
            ),
            # makeblastdb specific options
            _Option(
                ["-blastdb_version", "blastdb_version"],
                "Version of BLAST database to be created. "
                "Tip: use BLAST database version 4 on 32 bit CPU. "
                "Default = 5",
                equate=False,
                checker_function=lambda x: x == 4 or x == 5,
            ),
            _Option(
                ["-dbtype", "dbtype"],
                "Molecule type of target db ('nucl' or 'prot').",
                equate=False,
                is_required=True,
                checker_function=lambda x: x == "nucl" or x == "prot",
            ),
            _Option(
                ["-in", "input_file"],
                "Input file/database name.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-input_type", "input_type"],
                "Type of the data specified in input_file.\n\n"
                "Default = 'fasta'. Added in BLAST 2.2.26.",
                filename=False,
                equate=False,
                checker_function=self._input_type_checker,
            ),
            _Option(
                ["-title", "title"],
                "Title for BLAST database.",
                filename=False,
                equate=False,
            ),
            _Switch(
                ["-parse_seqids", "parse_seqids"],
                "Option to parse seqid for FASTA input if set.\n\n"
                "For all other input types, seqids are parsed automatically",
            ),
            _Switch(
                ["-hash_index", "hash_index"], "Create index of sequence hash values."
            ),
            _Option(
                ["-mask_data", "mask_data"],
                "Comma-separated list of input files containing masking "
                "data as produced by NCBI masking applications "
                "(e.g. dustmasker, segmasker, windowmasker).",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-mask_id", "mask_id"],
                "Comma-separated list of strings to uniquely identify the "
                "masking algorithm.",
                filename=False,
                equate=False,
            ),
            _Option(
                ["-mask_desc", "mask_desc"],
                "Comma-separated list of free form strings to describe "
                "the masking algorithm details.",
                filename=False,
                equate=False,
            ),
            _Switch(["-gi_mask", "gi_mask"], "Create GI indexed masking data."),
            _Option(
                ["-gi_mask_name", "gi_mask_name"],
                "Comma-separated list of masking data output files.",
                filename=False,
                equate=False,
            ),
            _Option(
                ["-max_file_sz", "max_file_sz"],
                "Maximum file size for BLAST database files. Default = '1GB'.",
                filename=False,
                equate=False,
            ),
            _Option(
                ["-logfile", "logfile"],
                "File to which the program log should be redirected.",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-taxid", "taxid"],
                "Taxonomy ID to assign to all sequences.",
                filename=False,
                equate=False,
                # True when the value survives a round trip through int(),
                # e.g. 9606 or "9606" (but not "9606.5" or "09606").
                checker_function=lambda x: type(x)(int(x)) == x,
            ),
            _Option(
                ["-taxid_map", "taxid_map"],
                "Text file mapping sequence IDs to taxonomy IDs.\n\n"
                "Format:<SequenceId> <TaxonomyId><newline>",
                filename=True,
                equate=False,
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)

    def _input_type_checker(self, command):
        """Return True if command is a recognised -input_type value (PRIVATE)."""
        return command in ("asn1_bin", "asn1_txt", "blastdb", "fasta")

    def _validate(self):
        """Check makeblastdb option combinations, then defer to the parent (PRIVATE).

        Raises ValueError if two mutually exclusive options are both set, or
        if an option is set without another option that it depends on.
        """
        incompatibles = {
            "mask_id": ["gi_mask"],
            "gi_mask": ["mask_id"],
            "taxid": ["taxid_map"],
        }

        # Copied from _NcbibaseblastCommandline class above.
        # Code repeated here because this is not a
        # _NcbibaseblastCommandline subclass.
        for option, conflicts in incompatibles.items():
            if self._get_parameter(option):
                for other in conflicts:
                    if self._get_parameter(other):
                        raise ValueError(
                            f"Options {option} and {other} are incompatible."
                        )

        # Options which only make sense together with another option:
        if self.mask_id and not self.mask_data:
            raise ValueError("Option mask_id requires mask_data to be set.")
        if self.mask_desc and not self.mask_id:
            raise ValueError("Option mask_desc requires mask_id to be set.")
        if self.gi_mask and not self.parse_seqids:
            raise ValueError("Option gi_mask requires parse_seqids to be set.")
        if self.gi_mask_name and not (self.mask_data and self.gi_mask):
            raise ValueError(
                "Option gi_mask_name requires mask_data and gi_mask to be set."
            )
        if self.taxid_map and not self.parse_seqids:
            raise ValueError("Option taxid_map requires parse_seqids to be set.")
        AbstractCommandline._validate(self)
|
|
|
|
|
|
def _test():
|
|
"""Run the Bio.Blast.Applications module's doctests (PRIVATE)."""
|
|
import doctest
|
|
|
|
doctest.testmod(verbose=1)
|
|
|
|
|
|
if __name__ == "__main__":
    # Executed directly as a script rather than imported: run the doctests.
    _test()
|