New Blast parser and qblast (#4505)

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* broken

* update

* update

* update

* update

* update

---------

Co-authored-by: Michiel de Hoon <mdehoon@tkx249.genome.gsc.riken.jp>
This commit is contained in:
mdehoon
2024-01-02 09:45:48 +09:00
committed by GitHub
parent 48f3e303d4
commit 307c7aef2d
14 changed files with 14974 additions and 487 deletions

View File

@ -1,3 +1,4 @@
# Copyright 1999-2000 by Jeffrey Chang. All rights reserved.
# Copyright 2000 by Bertrand Frottier. All rights reserved.
# Revisions 2005-2006 copyright Michiel de Hoon
# Revisions 2006-2009 copyright Peter Cock
@ -8,14 +9,468 @@
# package.
"""Code to work with the BLAST XML output.
The BLAST XML DTD file is on the NCBI FTP site at:
ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd
"""
The BLAST XML DTD file is available on the NCBI site at:
https://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd
Record classes to hold BLAST output are:
Classes:
Blast Holds all the information from a blast search.
PSIBlast Holds all the information from a psi-blast search.
Header Holds information from the header.
Description Holds information about one hit description.
Alignment Holds information about one alignment hit.
HSP Holds information about one HSP.
MultipleAlignment Holds information about a multiple alignment.
DatabaseReport Holds information from the database report.
Parameters Holds information from the parameters.
"""
# XXX finish printable BLAST output
from Bio.Blast import Record
import xml.sax
from xml.sax.handler import ContentHandler
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
def fmt_(value, format_spec="%s", default_str="<unknown>"):
    """Format a value with a printf-style spec, tolerating a missing value.

    Arguments:
     - value       - object to format; ``None`` is treated as "not available".
     - format_spec - %-style format string, applied as ``format_spec % value``.
     - default_str - text returned in place of a ``None`` value.

    Returns the formatted string, or ``default_str`` if ``value`` is ``None``.
    """
    return default_str if value is None else format_spec % value
class Header:
    """Container for the header section of a BLAST report.

    Members:
    application         The name of the BLAST flavor that generated this data.
    version             Version of blast used.
    date                Date this data was generated.
    reference           Reference for blast.
    query               Name of query sequence.
    query_letters       Number of letters in the query sequence. (int)
    database            Name of the database.
    database_sequences  Number of sequences in the database. (int)
    database_letters    Number of letters in the database. (int)
    """

    def __init__(self):
        """Create a header with empty text fields and unset counts."""
        # Textual fields default to the empty string ...
        self.application = self.version = self.date = self.reference = ""
        self.query = ""
        # ... whereas numeric fields stay None until a value is known.
        self.query_letters = None
        self.database = ""
        self.database_sequences = None
        self.database_letters = None
class Description:
    """Stores information about one hit in the descriptions section.

    Members:
    title           Title of the hit.
    score           Number of bits. (int)
    bits            Bit score. (float)
    e               E value. (float)
    num_alignments  Number of alignments for the same subject. (int)
    """

    def __init__(self):
        """Initialize the class with an empty title and unset statistics."""
        self.title = ""
        self.score = None
        self.bits = None
        self.e = None
        self.num_alignments = None

    def __str__(self):
        """Return the description as a one-line string.

        The title is left-aligned in 66 columns and the score right-aligned
        in 5 columns, followed by the E value. Unset values are shown as
        ``None``.
        """
        # ``None`` (the default score) rejects a format spec such as ">5" and
        # would raise TypeError; converting with ``!s`` first keeps the same
        # layout for real values while making a fresh object printable.
        return f"{self.title!s:<66} {self.score!s:>5} {self.e}"
class DescriptionExt(Description):
    """Extended description record for BLASTXML version 2.

    Members:
    items           List of DescriptionExtItem
    """

    def __init__(self):
        """Initialize the base description and start with no items."""
        super().__init__()
        self.items = []

    def append_item(self, item):
        """Add a description extended record.

        The string form of the first item added also becomes the title of
        this description; later items leave the title untouched.
        """
        if not self.items:
            self.title = str(item)
        self.items.append(item)
class DescriptionExtItem:
    """Stores information about one record in hit description for BLASTXML version 2.

    Members:
    id              Database identifier
    title           Title of the hit.

    The attributes ``accession``, ``taxid`` and ``sciname`` are also created,
    all initially ``None``.
    """

    def __init__(self):
        """Initialize the class with every attribute unset."""
        self.id = self.title = None
        self.accession = self.taxid = self.sciname = None

    def __str__(self):
        """Return the identifier and the title separated by a space."""
        return " ".join((f"{self.id}", f"{self.title}"))
class Alignment:
    """Stores information about one hit in the alignments section.

    Members:
    title           Name.
    hit_id          Hit identifier. (str)
    hit_def         Hit definition. (str)
    length          Length. (int)
    hsps            A list of HSP objects.
    """

    def __init__(self):
        """Initialize the class with empty text fields and no HSPs."""
        self.title = self.hit_id = self.hit_def = ""
        self.length = None
        self.hsps = []

    def __str__(self):
        """Return the BLAST alignment as a formatted string.

        Each line of the title is emitted in turn, followed by a
        ``Length = ...`` line; continuation lines are indented.
        """
        title_lines = self.title.split("\n")
        return "\n ".join([*title_lines, f"Length = {self.length}\n"])
class HSP:
    """Stores information about one hsp in an alignment hit.

    Members:
        - score           BLAST score of hit.  (float)
        - bits            Number of bits for that score.  (float)
        - expect          Expect value.  (float)
        - num_alignments  Number of alignments for same subject.  (int)
        - identities      Number of identities (int) if using the XML parser.
          Tuple of number of identities/total aligned (int, int)
          if using the (obsolete) plain text parser.
        - positives       Number of positives (int) if using the XML parser.
          Tuple of number of positives/total aligned (int, int)
          if using the (obsolete) plain text parser.
        - gaps            Number of gaps (int) if using the XML parser.
          Tuple of number of gaps/total aligned (int, int) if
          using the (obsolete) plain text parser.
        - align_length    Length of the alignment. (int)
        - strand          Tuple of (query, target) strand.
        - frame           Tuple of 1 or 2 frame shifts, depending on the flavor.
        - query           The query sequence.
        - query_start     The start residue for the query sequence.  (1-based)
        - query_end       The end residue for the query sequence.  (1-based)
        - match           The match sequence.
        - sbjct           The sbjct sequence.
        - sbjct_start     The start residue for the sbjct sequence.  (1-based)
        - sbjct_end       The end residue for the sbjct sequence.  (1-based)

    Not all flavors of BLAST (BLASTP, BLASTN, BLASTX, TBLASTN, TBLASTX)
    return values for every one of score, expect, identities, positives,
    strand and frame.

    Note: for BLASTX, the query sequence is shown as a protein sequence,
    but the numbering is based on the nucleotides.  Thus, the numbering
    is 3x larger than the number of amino acid residues.  A similar effect
    can be seen for the sbjct sequence in TBLASTN, and for both sequences
    in TBLASTX.

    Also, for negative frames, the sequence numbering starts from
    query_start and counts down.
    """

    def __init__(self):
        """Initialize the class with every statistic and sequence unset."""
        self.score = self.bits = self.expect = None
        self.num_alignments = None
        self.identities = self.positives = self.gaps = (None, None)
        self.align_length = None
        self.strand = (None, None)
        self.frame = ()
        self.query = ""
        self.query_start = self.query_end = None
        self.match = self.sbjct = ""
        self.sbjct_start = self.sbjct_end = None

    def __str__(self):
        """Return the BLAST HSP as a formatted string.

        The first line summarises score, bits, expectation and alignment
        length.  If the alignment length is known, three more lines show the
        query, match and sbjct sequences; alignments of length 50 or more
        are abbreviated to their first 45 and last 3 characters.
        """
        summary = "Score %s (%s bits), expectation %s, alignment length %s" % (
            fmt_(self.score, "%i"),
            fmt_(self.bits, "%i"),
            fmt_(self.expect, "%0.1e"),
            fmt_(self.align_length, "%i"),
        )
        if self.align_length is None:
            # Without a length there is no alignment to display.
            return summary
        if self.align_length < 50:
            # Short alignments are shown in full.
            query, match, sbjct = self.query, self.match, self.sbjct
        else:
            # Long alignments keep only the head and the tail of each row.
            query = "%s...%s" % (self.query[:45], self.query[-3:])
            match = f"{self.match[:45]}...{self.match[-3:]}"
            sbjct = "%s...%s" % (self.sbjct[:45], self.sbjct[-3:])
        return "\n".join(
            [
                summary,
                "Query:%8s %s %s" % (self.query_start, query, self.query_end),
                f" {match}",
                "Sbjct:%8s %s %s" % (self.sbjct_start, sbjct, self.sbjct_end),
            ]
        )
class MultipleAlignment:
    """Holds information about a multiple alignment.

    Members:
    alignment  A list of tuples (name, start residue, sequence, end residue).

    The start residue is 1-based.  It may be blank, if that sequence is
    not aligned in the multiple alignment.
    """

    def __init__(self):
        """Initialize the class with an empty list of alignment rows."""
        self.alignment = []

    def to_generic(self):
        """Retrieve generic alignment object for the given alignment.

        Instead of the tuples, this returns a MultipleSeqAlignment object
        from Bio.Align, through which you can manipulate and query
        the object.

        Thanks to James Casbon for the code.
        """
        names = []
        sequences = []
        block_count = 0
        row = 0
        for name, _start, seq, _end in self.alignment:
            if name == "QUERY":
                # QUERY opens every alignment block, so restart the row index.
                block_count += 1
                row = 0
            if block_count == 1:
                # The first block defines the rows ...
                sequences.append(seq)
                names.append(name)
            else:
                # ... and each later block extends them in the same order.
                sequences[row] += seq
                row += 1
        records = (
            SeqRecord(Seq(sequence), name)
            for (name, sequence) in zip(names, sequences)
        )
        return MultipleSeqAlignment(records)
class Round:
    """Holds information from a PSI-BLAST round.

    Members:
    number              Round number.  (int)
    reused_seqs         Sequences in model, found again.  List of Description objects.
    new_seqs            Sequences not found, or below threshold.  List of Description.
    alignments          A list of Alignment objects.
    multiple_alignment  A MultipleAlignment object.
    """

    def __init__(self):
        """Initialize the class with no round number and empty result lists."""
        self.number = None
        # Every list attribute gets its own fresh list object.
        self.reused_seqs, self.new_seqs, self.alignments = [], [], []
        self.multiple_alignment = None
class DatabaseReport:
    """Holds information about a database report.

    Members:
    database_name              List of database names.  (can have multiple dbs)
    num_letters_in_database    Number of letters in the database.  (int)
    num_sequences_in_database  List of number of sequences in the database.
    posted_date                List of the dates the databases were posted.
    ka_params                  A tuple of (lambda, k, h) values.  (floats)
    gapped                     # XXX this isn't set right!
    ka_params_gap              A tuple of (lambda, k, h) values.  (floats)
    """

    def __init__(self):
        """Initialize the class with empty lists and unset parameters."""
        self.database_name = []
        self.posted_date = []
        # NOTE: described above as an int, but it starts out as an empty list.
        self.num_letters_in_database = []
        self.num_sequences_in_database = []
        unset_triple = (None, None, None)
        self.ka_params = unset_triple
        self.gapped = 0
        self.ka_params_gap = unset_triple
class Parameters:
    """Holds information about the parameters.

    Members:
    matrix              Name of the matrix.
    gap_penalties       Tuple of (open, extend) penalties.  (floats)
    sc_match            Match score for nucleotide-nucleotide comparison
    sc_mismatch         Mismatch penalty for nucleotide-nucleotide comparison
    num_hits            Number of hits to the database.  (int)
    num_sequences       Number of sequences.  (int)
    num_good_extends    Number of extensions.  (int)
    num_seqs_better_e   Number of sequences better than e-value.  (int)
    hsps_no_gap         Number of HSP's better, without gapping.  (int)
    hsps_prelim_gapped  Number of HSP's gapped in prelim test.  (int)
    hsps_prelim_gapped_attemped  Number of HSP's attempted in prelim.  (int)
    hsps_gapped         Total number of HSP's gapped.  (int)
    query_length        Length of the query.  (int)
    query_id            Identifier of the query sequence. (str)
    database_length     Number of letters in the database.  (int)
    effective_hsp_length         Effective HSP length.  (int)
    effective_query_length       Effective length of query.  (int)
    effective_database_length    Effective length of database.  (int)
    effective_search_space       Effective search space.  (int)
    effective_search_space_used  Effective search space used.  (int)
    frameshift          Frameshift window.  Tuple of (int, float)
    threshold           Threshold.  (int)
    window_size         Window size.  (int)
    dropoff_1st_pass    Tuple of (score, bits).  (int, float)
    gap_x_dropoff       Tuple of (score, bits).  (int, float)
    gap_x_dropoff_final Tuple of (score, bits).  (int, float)
    gap_trigger         Tuple of (score, bits).  (int, float)
    blast_cutoff        Tuple of (score, bits).  (int, float)
    """

    def __init__(self):
        """Initialize the class with every parameter unset."""
        unset_pair = (None, None)

        # Scoring scheme.
        self.matrix = ""
        self.gap_penalties = unset_pair
        self.sc_match = self.sc_mismatch = None

        # Search counters.
        self.num_hits = self.num_sequences = None
        self.num_good_extends = self.num_seqs_better_e = None
        self.hsps_no_gap = self.hsps_prelim_gapped = None
        self.hsps_prelim_gapped_attemped = self.hsps_gapped = None

        # Query and database sizes.
        self.query_id = self.query_length = None
        self.database_length = None
        self.effective_hsp_length = self.effective_query_length = None
        self.effective_database_length = None
        self.effective_search_space = None
        self.effective_search_space_used = None

        # Thresholds and windows.
        self.frameshift = unset_pair
        self.threshold = self.window_size = None

        # (score, bits) pairs.
        self.dropoff_1st_pass = unset_pair
        self.gap_x_dropoff = unset_pair
        self.gap_x_dropoff_final = unset_pair
        self.gap_trigger = unset_pair
        self.blast_cutoff = unset_pair
# TODO - Add a friendly __str__ method to BLAST results
class Blast(Header, DatabaseReport, Parameters):
    """Saves the results from a blast search.

    Members:
    descriptions        A list of Description objects.
    alignments          A list of Alignment objects.
    multiple_alignment  A MultipleAlignment object.
    + members inherited from base classes
    """

    def __init__(self):
        """Initialize all three base classes, then the result containers."""
        # Each base initializer is invoked explicitly, in declaration order.
        for base in (Header, DatabaseReport, Parameters):
            base.__init__(self)
        self.descriptions, self.alignments = [], []
        self.multiple_alignment = None
class PSIBlast(Header, DatabaseReport, Parameters):
    """Saves the results from a blastpgp search.

    Members:
    rounds       A list of Round objects.
    converged    Whether the search converged.
    + members inherited from base classes
    """

    def __init__(self):
        """Initialize all three base classes, then the round bookkeeping."""
        # Each base initializer is invoked explicitly, in declaration order.
        for base in (Header, DatabaseReport, Parameters):
            base.__init__(self)
        self.rounds = []
        self.converged = 0
class _XMLparser(ContentHandler):
"""Generic SAX Parser (PRIVATE).
@ -124,7 +579,7 @@ class _XMLparser(ContentHandler):
class BlastParser(_XMLparser):
"""Parse XML BLAST data into a Record.Blast object.
"""Parse XML BLAST data into a Blast object.
Parses XML output from BLAST (direct use discouraged).
This (now) returns a list of Blast records.
@ -164,8 +619,8 @@ class BlastParser(_XMLparser):
def reset(self):
"""Reset all the data allowing reuse of the BlastParser() object."""
self._records = []
self._header = Record.Header()
self._parameters = Record.Parameters()
self._header = Header()
self._parameters = Parameters()
self._parameters.filter = None # Maybe I should update the class?
def _on_root_node(self, name):
@ -295,12 +750,12 @@ class BlastParser(_XMLparser):
def _start_blast_record(self):
"""Start interaction (PRIVATE)."""
self._blast = Record.Blast()
self._blast = Blast()
def _end_blast_record(self):
"""End interaction (PRIVATE)."""
# We stored a lot of generic "top level" information
# in self._header (an object of type Record.Header)
# in self._header (an object of type Header)
self._blast.reference = self._header.reference
self._blast.date = self._header.date
self._blast.version = self._header.version
@ -435,7 +890,7 @@ class BlastParser(_XMLparser):
# def _end_BlastOutput_query_seq(self):
# """The query sequence (PRIVATE)."""
# pass # XXX Missing in Record.Blast ?
# pass # XXX Missing in Blast ?
# def _end_BlastOutput_iter_num(self):
# """The psi-blast iteration number (PRIVATE)."""
@ -505,10 +960,8 @@ class BlastParser(_XMLparser):
# Hits
def _start_hit(self):
"""Start filling records (PRIVATE)."""
self._blast.alignments.append(Record.Alignment())
self._descr = (
Record.Description() if self._xml_version == 1 else Record.DescriptionExt()
)
self._blast.alignments.append(Alignment())
self._descr = Description() if self._xml_version == 1 else DescriptionExt()
self._blast.descriptions.append(self._descr)
self._blast.multiple_alignment = []
self._hit = self._blast.alignments[-1]
@ -552,11 +1005,11 @@ class BlastParser(_XMLparser):
def _start_hsp(self):
# Note that self._start_Hit() should have been called
# to setup things like self._blast.multiple_alignment
self._hsp = Record.HSP()
self._hsp = HSP()
self._hsp.positives = None
self._hit.hsps.append(self._hsp)
self._descr.num_alignments += 1
self._blast.multiple_alignment.append(Record.MultipleAlignment())
self._blast.multiple_alignment.append(MultipleAlignment())
self._mult_al = self._blast.multiple_alignment[-1]
def _end_hsp(self):
@ -703,7 +1156,7 @@ class BlastParser(_XMLparser):
def _start_hit_descr_item(self):
"""XML v2. Start hit description item."""
self._hit_descr_item = Record.DescriptionExtItem()
self._hit_descr_item = DescriptionExtItem()
def _end_hit_descr_item(self):
"""XML v2. End hit description item."""

View File

@ -1,460 +0,0 @@
# Copyright 1999-2000 by Jeffrey Chang. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Record classes to hold BLAST output.
Classes:
Blast Holds all the information from a blast search.
PSIBlast Holds all the information from a psi-blast search.
Header Holds information from the header.
Description Holds information about one hit description.
Alignment Holds information about one alignment hit.
HSP Holds information about one HSP.
MultipleAlignment Holds information about a multiple alignment.
DatabaseReport Holds information from the database report.
Parameters Holds information from the parameters.
"""
# XXX finish printable BLAST output
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
def fmt_(value, format_spec="%s", default_str="<unknown>"):
"""Ensure the given value formats to a string correctly."""
if value is None:
return default_str
return format_spec % value
class Header:
"""Saves information from a blast header.
Members:
application The name of the BLAST flavor that generated this data.
version Version of blast used.
date Date this data was generated.
reference Reference for blast.
query Name of query sequence.
query_letters Number of letters in the query sequence. (int)
database Name of the database.
database_sequences Number of sequences in the database. (int)
database_letters Number of letters in the database. (int)
"""
def __init__(self):
"""Initialize the class."""
self.application = ""
self.version = ""
self.date = ""
self.reference = ""
self.query = ""
self.query_letters = None
self.database = ""
self.database_sequences = None
self.database_letters = None
class Description:
"""Stores information about one hit in the descriptions section.
Members:
title Title of the hit.
score Number of bits. (int)
bits Bit score. (float)
e E value. (float)
num_alignments Number of alignments for the same subject. (int)
"""
def __init__(self):
"""Initialize the class."""
self.title = ""
self.score = None
self.bits = None
self.e = None
self.num_alignments = None
def __str__(self):
"""Return the description as a string."""
return f"{self.title:<66} {self.score:>5} {self.e}"
class DescriptionExt(Description):
"""Extended description record for BLASTXML version 2.
Members:
items List of DescriptionExtItem
"""
def __init__(self):
"""Initialize the class."""
super().__init__()
self.items = []
def append_item(self, item):
"""Add a description extended record."""
if len(self.items) == 0:
self.title = str(item)
self.items.append(item)
class DescriptionExtItem:
"""Stores information about one record in hit description for BLASTXML version 2.
Members:
id Database identifier
title Title of the hit.
"""
def __init__(self):
"""Initialize the class."""
self.id = None
self.title = None
self.accession = None
self.taxid = None
self.sciname = None
def __str__(self):
"""Return the description identifier and title as a string."""
return f"{self.id} {self.title}"
class Alignment:
"""Stores information about one hit in the alignments section.
Members:
title Name.
hit_id Hit identifier. (str)
hit_def Hit definition. (str)
length Length. (int)
hsps A list of HSP objects.
"""
def __init__(self):
"""Initialize the class."""
self.title = ""
self.hit_id = ""
self.hit_def = ""
self.length = None
self.hsps = []
def __str__(self):
"""Return the BLAST alignment as a formatted string."""
lines = self.title.split("\n")
lines.append(f"Length = {self.length}\n")
return "\n ".join(lines)
class HSP:
"""Stores information about one hsp in an alignment hit.
Members:
- score BLAST score of hit. (float)
- bits Number of bits for that score. (float)
- expect Expect value. (float)
- num_alignments Number of alignments for same subject. (int)
- identities Number of identities (int) if using the XML parser.
Tuple of number of identities/total aligned (int, int)
if using the (obsolete) plain text parser.
- positives Number of positives (int) if using the XML parser.
Tuple of number of positives/total aligned (int, int)
if using the (obsolete) plain text parser.
- gaps Number of gaps (int) if using the XML parser.
Tuple of number of gaps/total aligned (int, int) if
using the (obsolete) plain text parser.
- align_length Length of the alignment. (int)
- strand Tuple of (query, target) strand.
- frame Tuple of 1 or 2 frame shifts, depending on the flavor.
- query The query sequence.
- query_start The start residue for the query sequence. (1-based)
- query_end The end residue for the query sequence. (1-based)
- match The match sequence.
- sbjct The sbjct sequence.
- sbjct_start The start residue for the sbjct sequence. (1-based)
- sbjct_end The end residue for the sbjct sequence. (1-based)
Not all flavors of BLAST return values for every attribute::
score expect identities positives strand frame
BLASTP X X X X
BLASTN X X X X X
BLASTX X X X X X
TBLASTN X X X X X
TBLASTX X X X X X/X
Note: for BLASTX, the query sequence is shown as a protein sequence,
but the numbering is based on the nucleotides. Thus, the numbering
is 3x larger than the number of amino acid residues. A similar effect
can be seen for the sbjct sequence in TBLASTN, and for both sequences
in TBLASTX.
Also, for negative frames, the sequence numbering starts from
query_start and counts down.
"""
def __init__(self):
"""Initialize the class."""
self.score = None
self.bits = None
self.expect = None
self.num_alignments = None
self.identities = (None, None)
self.positives = (None, None)
self.gaps = (None, None)
self.align_length = None
self.strand = (None, None)
self.frame = ()
self.query = ""
self.query_start = None
self.query_end = None
self.match = ""
self.sbjct = ""
self.sbjct_start = None
self.sbjct_end = None
def __str__(self):
"""Return the BLAST HSP as a formatted string."""
lines = [
"Score %s (%s bits), expectation %s, alignment length %s"
% (
fmt_(self.score, "%i"),
fmt_(self.bits, "%i"),
fmt_(self.expect, "%0.1e"),
fmt_(self.align_length, "%i"),
)
]
if self.align_length is None:
return "\n".join(lines)
if self.align_length < 50:
lines.append(
"Query:%8s %s %s" % (self.query_start, self.query, self.query_end)
)
lines.append(f" {self.match}")
lines.append(
"Sbjct:%8s %s %s" % (self.sbjct_start, self.sbjct, self.sbjct_end)
)
else:
lines.append(
"Query:%8s %s...%s %s"
% (self.query_start, self.query[:45], self.query[-3:], self.query_end)
)
lines.append(f" {self.match[:45]}...{self.match[-3:]}")
lines.append(
"Sbjct:%8s %s...%s %s"
% (self.sbjct_start, self.sbjct[:45], self.sbjct[-3:], self.sbjct_end)
)
return "\n".join(lines)
class MultipleAlignment:
"""Holds information about a multiple alignment.
Members:
alignment A list of tuples (name, start residue, sequence, end residue).
The start residue is 1-based. It may be blank, if that sequence is
not aligned in the multiple alignment.
"""
def __init__(self):
"""Initialize the class."""
self.alignment = []
def to_generic(self):
"""Retrieve generic alignment object for the given alignment.
Instead of the tuples, this returns a MultipleSeqAlignment object
from Bio.Align, through which you can manipulate and query
the object.
Thanks to James Casbon for the code.
"""
seq_parts = []
seq_names = []
parse_number = 0
n = 0
for name, start, seq, end in self.alignment:
if name == "QUERY": # QUERY is the first in each alignment block
parse_number += 1
n = 0
if parse_number == 1: # create on first_parse, append on all others
seq_parts.append(seq)
seq_names.append(name)
else:
seq_parts[n] += seq
n += 1
records = (
SeqRecord(Seq(seq), name) for (name, seq) in zip(seq_names, seq_parts)
)
return MultipleSeqAlignment(records)
class Round:
"""Holds information from a PSI-BLAST round.
Members:
number Round number. (int)
reused_seqs Sequences in model, found again. List of Description objects.
new_seqs Sequences not found, or below threshold. List of Description.
alignments A list of Alignment objects.
multiple_alignment A MultipleAlignment object.
"""
def __init__(self):
"""Initialize the class."""
self.number = None
self.reused_seqs = []
self.new_seqs = []
self.alignments = []
self.multiple_alignment = None
class DatabaseReport:
"""Holds information about a database report.
Members:
database_name List of database names. (can have multiple dbs)
num_letters_in_database Number of letters in the database. (int)
num_sequences_in_database List of number of sequences in the database.
posted_date List of the dates the databases were posted.
ka_params A tuple of (lambda, k, h) values. (floats)
gapped # XXX this isn't set right!
ka_params_gap A tuple of (lambda, k, h) values. (floats)
"""
def __init__(self):
"""Initialize the class."""
self.database_name = []
self.posted_date = []
self.num_letters_in_database = []
self.num_sequences_in_database = []
self.ka_params = (None, None, None)
self.gapped = 0
self.ka_params_gap = (None, None, None)
class Parameters:
"""Holds information about the parameters.
Members:
matrix Name of the matrix.
gap_penalties Tuple of (open, extend) penalties. (floats)
sc_match Match score for nucleotide-nucleotide comparison
sc_mismatch Mismatch penalty for nucleotide-nucleotide comparison
num_hits Number of hits to the database. (int)
num_sequences Number of sequences. (int)
num_good_extends Number of extensions. (int)
num_seqs_better_e Number of sequences better than e-value. (int)
hsps_no_gap Number of HSP's better, without gapping. (int)
hsps_prelim_gapped Number of HSP's gapped in prelim test. (int)
hsps_prelim_gapped_attemped Number of HSP's attempted in prelim. (int)
hsps_gapped Total number of HSP's gapped. (int)
query_length Length of the query. (int)
query_id Identifier of the query sequence. (str)
database_length Number of letters in the database. (int)
effective_hsp_length Effective HSP length. (int)
effective_query_length Effective length of query. (int)
effective_database_length Effective length of database. (int)
effective_search_space Effective search space. (int)
effective_search_space_used Effective search space used. (int)
frameshift Frameshift window. Tuple of (int, float)
threshold Threshold. (int)
window_size Window size. (int)
dropoff_1st_pass Tuple of (score, bits). (int, float)
gap_x_dropoff Tuple of (score, bits). (int, float)
gap_x_dropoff_final Tuple of (score, bits). (int, float)
gap_trigger Tuple of (score, bits). (int, float)
blast_cutoff Tuple of (score, bits). (int, float)
"""
def __init__(self):
"""Initialize the class."""
self.matrix = ""
self.gap_penalties = (None, None)
self.sc_match = None
self.sc_mismatch = None
self.num_hits = None
self.num_sequences = None
self.num_good_extends = None
self.num_seqs_better_e = None
self.hsps_no_gap = None
self.hsps_prelim_gapped = None
self.hsps_prelim_gapped_attemped = None
self.hsps_gapped = None
self.query_id = None
self.query_length = None
self.database_length = None
self.effective_hsp_length = None
self.effective_query_length = None
self.effective_database_length = None
self.effective_search_space = None
self.effective_search_space_used = None
self.frameshift = (None, None)
self.threshold = None
self.window_size = None
self.dropoff_1st_pass = (None, None)
self.gap_x_dropoff = (None, None)
self.gap_x_dropoff_final = (None, None)
self.gap_trigger = (None, None)
self.blast_cutoff = (None, None)
# TODO - Add a friendly __str__ method to BLAST results
class Blast(Header, DatabaseReport, Parameters):
"""Saves the results from a blast search.
Members:
descriptions A list of Description objects.
alignments A list of Alignment objects.
multiple_alignment A MultipleAlignment object.
+ members inherited from base classes
"""
def __init__(self):
"""Initialize the class."""
Header.__init__(self)
DatabaseReport.__init__(self)
Parameters.__init__(self)
self.descriptions = []
self.alignments = []
self.multiple_alignment = None
class PSIBlast(Header, DatabaseReport, Parameters):
"""Saves the results from a blastpgp search.
Members:
rounds A list of Round objects.
converged Whether the search converged.
+ members inherited from base classes
"""
def __init__(self):
"""Initialize the class."""
Header.__init__(self)
DatabaseReport.__init__(self)
Parameters.__init__(self)
self.rounds = []
self.converged = 0

View File

@ -1,7 +1,806 @@
# Copyright 1999 by Jeffrey Chang. All rights reserved.
# Revisions 2023 by Michiel de Hoon. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Code for dealing with BLAST programs and output."""
"""Code to parse and store BLAST XML output, and to invoke the NCBI BLAST web server.
This module provides code to parse and store BLAST XML output, following its
definition in the associated BLAST XML DTD file:
https://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd
This module also provides code to invoke the BLAST web server provided by NCBI.
https://blast.ncbi.nlm.nih.gov/
Variables:
- email Set the Blast email parameter (default is None).
- tool Set the Blast tool parameter (default is ``biopython``).
"""
import warnings
import time
from urllib.parse import urlencode
from urllib.request import build_opener, install_opener
from urllib.request import urlopen
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler
from urllib.request import Request
from xml.parsers import expat
from Bio import BiopythonWarning
from Bio import StreamModeError
from Bio._utils import function_with_previous
email = None
tool = "biopython"
NCBI_BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"
BLOCK = 2048 # default block size from expat
class NotXMLError(ValueError):
    """Failed to parse file as XML."""

    def __init__(self, message):
        """Initialize the class.

        Arguments:
         - message - description of the underlying parse failure; stored as
           ``msg`` and embedded in the string form of the exception.
        """
        # Pass the message on to ValueError so that ``args`` is populated;
        # with empty ``args`` the exception cannot be pickled or copied,
        # because it would be rebuilt without the required argument.
        super().__init__(message)
        self.msg = message

    def __str__(self):
        """Return a string summary of the exception."""
        return (
            f"Failed to parse the XML data ({self.msg}). Please make sure that the input data "
            "are in XML format."
        )
class CorruptedXMLError(ValueError):
    """Corrupted XML."""

    def __init__(self, message):
        """Initialize the class.

        Arguments:
         - message - description of the underlying parse failure; stored as
           ``msg`` and embedded in the string form of the exception.
        """
        # Pass the message on to ValueError so that ``args`` is populated;
        # with empty ``args`` the exception cannot be pickled or copied,
        # because it would be rebuilt without the required argument.
        super().__init__(message)
        self.msg = message

    def __str__(self):
        """Return a string summary of the exception."""
        return (
            f"Failed to parse the XML data ({self.msg}). Please make sure that the input data "
            "are not corrupted."
        )
class Record(list):
    """Stores the BLAST results for a single query.

    A ``Bio.Blast.Record`` object is a list of ``Bio.Align.Alignments`` objects,
    each corresponding to one hit for the query in the BLAST output.

    The ``Bio.Blast.Record`` object may have the following attributes:

     - query:   A ``SeqRecord`` object which may contain some or all of the
                following information:

                 - query.id:          SeqId of query;
                 - query.description: Definition line of query;
                 - len(query.seq):    Length of the query sequence.

     - stat:    A dictionary with summary statistics of the BLAST run. It may
                contain the following keys:

                 - 'db-num':    number of sequences in BLAST db (integer);
                 - 'db-len':    length of BLAST db (integer);
                 - 'hsp-len':   effective HSP length (integer);
                 - 'eff-space': effective search space (float);
                 - 'kappa':     Karlin-Altschul parameter K (float);
                 - 'lambda':    Karlin-Altschul parameter Lambda (float);
                 - 'entropy':   Karlin-Altschul parameter H (float).

     - message: Some (error?) information.

    Each ``Bio.Align.Alignments`` object has a ``target`` attribute containing
    the following information:

     - target.id:          seqId of subject;
     - target.description: definition line of subject;
     - target.name:        accession of subject;
     - len(target.seq):    sequence length of subject.

    The ``Bio.Align.Alignments`` class inherits from a list storing
    ``Bio.Align.Alignment`` objects. The ``target`` and ``query`` attributes
    of a ``Bio.Align.Alignment`` object point to a ``SeqRecord`` object
    representing the target and query, respectively. For translated BLAST
    searches, the ``features`` attribute of the target or query may contain a
    ``SeqFeature`` of type CDS that stores the amino acid sequence region. The
    ``qualifiers`` attribute of such a feature is a dictionary with a single
    key 'coded_by'; the corresponding value specifies the nucleotide sequence
    region, in a GenBank-style string with 1-based coordinates, that encodes
    the amino acid sequence.

    Each ``Bio.Align.Alignment`` object has the following additional attributes:

     - score:       score of HSP;
     - annotations: a dictionary that may contain the following keys:

                     - 'bit score': score (in bits) of HSP (float);
                     - 'evalue':    e-value of HSP (float);
                     - 'identity':  number of identities in HSP (integer);
                     - 'positive':  number of positives in HSP (integer);
                     - 'gaps':      number of gaps in HSP (integer);
                     - 'midline':   formatting middle line.

    >>> from Bio import Blast
    >>> record = Blast.read("Blast/xml_2212L_blastx_001.xml")
    >>> record.query
    SeqRecord(seq=Seq(None, length=556), id='gi|1347369|gb|G25137.1|G25137', name='<unknown name>', description='human STS EST48004, sequence tagged site', dbxrefs=[])
    >>> record.stat
    {'db-num': 2934173, 'db-len': 1011751523, 'hsp-len': 0, 'eff-space': 0.0, 'kappa': 0.041, 'lambda': 0.267, 'entropy': 0.14}
    >>> len(record)
    78
    >>> alignments = record[0]
    >>> type(alignments)
    <class 'Bio.Align.Alignments'>
    >>> alignments.target
    SeqRecord(seq=Seq(None, length=319), id='gi|12654095|gb|AAH00859.1|', name='AAH00859', description='Unknown (protein for IMAGE:3459481) [Homo sapiens]', dbxrefs=[])

    Most alignments consist of only 1 or a few Alignment objects:

    >>> len(alignments)
    1
    >>> alignment = alignments[0]
    >>> type(alignment)
    <class 'Bio.Align.Alignment'>
    >>> alignment.score
    630.0
    >>> alignment.annotations
    {'bit score': 247.284, 'evalue': 1.69599e-64, 'identity': 122, 'positive': 123, 'gaps': 0, 'midline': 'DLQLLIKAVNLFPAGTNSRWEVIANYMNIHSSSGVKRTAKDVIGKAKSLQKLDPHQKDDINKKAFDKFKKEHGVVPQADNATPSERF GPYTDFTP TTE QKL EQAL TYPVNT ERW IA AVPGR K+'}

    Target and query information are stored in the respective attributes of the
    alignment:

    >>> alignment.target
    SeqRecord(seq=Seq({155: 'DLQLLIKAVNLFPAGTNSRWEVIANYMNIHSSSGVKRTAKDVIGKAKSLQKLDP...TKK'}, length=319), id='gi|12654095|gb|AAH00859.1|', name='AAH00859', description='Unknown (protein for IMAGE:3459481) [Homo sapiens]', dbxrefs=[])
    >>> alignment.query
    SeqRecord(seq=Seq('DLQLLIKAVNLFPAGTNSRWEVIANYMNIHSSSGVKRTAKDVIGKAKSLQKLDP...XKE'), id='gi|1347369|gb|G25137.1|G25137', name='<unknown name>', description='human STS EST48004, sequence tagged site', dbxrefs=[])

    This was a BLASTX run, so the query sequence was translated:

    >>> len(alignment.target.features)
    0
    >>> len(alignment.query.features)
    1
    >>> feature = alignment.query.features[0]
    >>> feature
    SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(133)), type='CDS', qualifiers=...)
    >>> feature.qualifiers
    {'coded_by': 'gi|1347369|gb|G25137.1|G25137:1..399'}

    i.e., nucleotides 0:399 (in zero-based coordinates) encode the amino acids
    of the query in the alignment.

    For an alignment against the reverse strand, the location in the qualifier
    is shown as in this example:

    >>> record[72][0].query.features[0].qualifiers
    {'coded_by': 'complement(gi|1347369|gb|G25137.1|G25137:345..530)'}

    """

    def __init__(self):
        """Create an empty record; ``query`` stays None until it is assigned."""
        super().__init__()
        self.query = None
class Records:
    """Stores the BLAST results of a single BLAST run.

    A ``Bio.Blast.Records`` object is an iterator. Iterating over it returns
    ``Bio.Blast.Record`` objects, each of which corresponds to one BLAST query.

    Common attributes of a ``Bio.Blast.Records`` object are

     - source:    The input data from which the ``Bio.Blast.Records`` object
                  was constructed.
     - program:   The specific BLAST program that was used (e.g., 'blastn').
     - version:   The version of the BLAST program (e.g., 'BLASTN 2.2.27+').
     - reference: The literature reference to the BLAST publication.
     - db:        The BLAST database against which the query was run
                  (e.g., 'nr').
     - query:     A ``SeqRecord`` object which may contain some or all of the
                  following information:

                   - query.id:          SeqId of the query;
                   - query.description: Definition line of the query;
                   - query.seq:         The query sequence.

     - param:     A dictionary with the parameters used for the BLAST run.
                  You may find the following keys in this dictionary:

                   - 'matrix':       the scoring matrix used in the BLAST run
                                     (e.g., 'BLOSUM62') (string);
                   - 'expect':       threshold on the expected number of chance
                                     matches (float);
                   - 'include':      e-value threshold for inclusion in
                                     multipass model in psiblast (float);
                   - 'sc-match':     score for matching nucleotides (integer);
                   - 'sc-mismatch':  score for mismatched nucleotides
                                     (integer);
                   - 'gap-open':     gap opening cost (integer);
                   - 'gap-extend':   gap extension cost (integer);
                   - 'filter':       filtering options applied in the BLAST
                                     run (string);
                   - 'pattern':      PHI-BLAST pattern (string);
                   - 'entrez-query': Limit of request to Entrez query (string).

     - mbstat:    A dictionary with Mega BLAST search statistics. As this
                  information is stored near the end of the XML file, this
                  attribute can only be accessed after the file has been read
                  completely (by iterating over the records until a
                  ``StopIteration`` is issued). This dictionary can contain the
                  same keys as the dictionary stored under the ``stat``
                  attribute of a ``Record`` object.

    >>> from Bio import Blast
    >>> path = "Blast/xml_2218_blastp_002.xml"

    In a script, you would use a ``with`` block, as in

    >>> with Blast.parse(path) as records:
    ...     print(records.source)
    ...
    Blast/xml_2218_blastp_002.xml

    to ensure that the file is closed at the end of the block.
    Here, we will simply do

    >>> records = Blast.parse("Blast/xml_2218_blastp_002.xml")

    so we can see the output of each command right away.

    >>> type(records)
    <class 'Bio.Blast.Records'>
    >>> records.source
    'Blast/xml_2218_blastp_002.xml'
    >>> records.program
    'blastp'
    >>> records.version
    'BLASTP 2.2.18+'
    >>> records.reference
    'Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schäffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.'
    >>> records.db
    'gpipe/9606/Previous/protein'
    >>> records.param
    {'matrix': 'BLOSUM62', 'expect': 0.01, 'gap-open': 11, 'gap-extend': 1, 'filter': 'm L; R -d repeat/repeat_9606;'}

    Iterating over the records returns Bio.Blast.Record objects:

    >>> record = next(records)
    >>> type(record)
    <class 'Bio.Blast.Record'>
    >>> record.query.id
    'gi|585505|sp|Q08386|MOPB_RHOCA'
    >>> record = next(records)
    >>> type(record)
    <class 'Bio.Blast.Record'>
    >>> record.query.id
    'gi|129628|sp|P07175.1|PARA_AGRTU'
    >>> record = next(records)  # doctest:+ELLIPSIS
    Traceback (most recent call last):
    ...
    StopIteration

    """  # noqa: RST201, RST203, RST301

    def __init__(self, source):
        """Initialize the Records object.

        Opens ``source`` (a path, or a stream opened in binary mode) and feeds
        it to an expat parser in blocks of ``BLOCK`` bytes until either the
        ``_cache`` attribute has been created on this object or the handler
        has released its parser.

        Raises:
         - StreamModeError   - if ``source`` is a stream not in binary mode;
         - NotXMLError       - if the parser fails before a start-element
           handler was installed;
         - CorruptedXMLError - if the parser fails after that point;
         - ValueError        - if the data end before parsing is complete.

        """
        from Bio.Blast._parser import XMLHandler

        self.source = source
        try:
            stream = open(source, "rb")
        except TypeError:  # not a path, assume we received a stream
            # Reading zero bytes returns b"" only for a binary stream.
            if source.read(0) != b"":
                raise StreamModeError(
                    "BLAST output files must be opened in binary mode."
                ) from None
            stream = source
        try:  # context manager won't kick in until after parse returns
            self._stream = stream
            parser = expat.ParserCreate()
            self._parser = parser
            handler = XMLHandler(parser)
            # The handler stores what it parses on this Records object.
            handler._records = self
            while True:
                data = stream.read(BLOCK)
                if data == b"":
                    # End of input. NOTE(review): presumably the handler drops
                    # its _parser attribute once the document is complete;
                    # confirm in XMLHandler.
                    try:
                        handler._parser
                    except AttributeError:
                        break
                    else:
                        raise ValueError(
                            f"premature end of XML file (after reading {parser.CurrentByteIndex} bytes)"
                        )
                try:
                    parser.Parse(data, False)
                except expat.ExpatError as e:
                    if parser.StartElementHandler:
                        # We saw the initial <?xml declaration, so we can be
                        # sure that we are parsing XML data. Most likely, the
                        # XML file is corrupted.
                        raise CorruptedXMLError(e) from None
                    else:
                        # We have not seen the initial <?xml declaration, so
                        # probably the input data is not in XML format.
                        raise NotXMLError(e) from None
                # _cache is created by the handler (see
                # XMLHandler._start_blastoutput_iterations).
                try:
                    self._cache
                except AttributeError:
                    pass
                else:
                    # We have finished reading the header
                    break
        except Exception:
            # Only close streams that were opened here, not the caller's.
            if stream is not source:
                stream.close()
            raise

    def __enter__(self):
        """Return the Records object itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Close the stream if it was opened by this object, then forget it."""
        try:
            stream = self._stream
        except AttributeError:
            # Already released by an earlier call.
            return
        # A stream supplied by the caller is left open.
        if stream is not self.source:
            stream.close()
        del self._stream

    def __iter__(self):
        """Return the Records object itself, as it is its own iterator."""
        return self

    def __next__(self):
        """Return the next record, parsing more of the input when needed.

        Raises:
         - StopIteration     - if no records are left;
         - CorruptedXMLError - if the parser reports an error;
         - ValueError        - if the data end while the parser still has a
           start-element handler installed.

        """
        try:
            cache = self._cache
        except AttributeError:
            # _cache is deleted below once the input is exhausted.
            raise StopIteration from None
        parser = self._parser
        stream = self._stream
        while True:
            try:
                record = self._cache.popleft()
            except IndexError:  # no record ready to be returned
                pass
            else:
                return record
            # Read in another block of data from the file.
            data = stream.read(BLOCK)
            if data == b"":
                # End of input: drop the state so that later calls stop at once.
                del self._cache
                del self._parser
                if parser.StartElementHandler is not None:
                    raise ValueError(
                        f"premature end of XML file (after reading {parser.CurrentByteIndex} bytes)"
                    )
                raise StopIteration
            try:
                parser.Parse(data, False)
            except expat.ExpatError as e:
                raise CorruptedXMLError(e) from None
def parse(source):
    """Parse an XML file containing BLAST output and return a Bio.Blast.Records object.

    The returned object is an iterator that yields one Bio.Blast.Record object
    per BLAST query.

    The source is either the path to an XML file with BLAST output, or a file
    stream. A file stream must be in binary mode, so that the parser can detect
    the encoding from the XML file, and use it to convert any text in the XML
    to the correct Unicode string. The qblast function in Bio.Blast returns a
    file stream in binary mode. For files, please use mode "rb" when opening
    the file, as in

    >>> from Bio import Blast
    >>> stream = open("Blast/wnts.xml", "rb")  # opened in binary mode
    >>> records = Blast.parse(stream)
    >>> for record in records:
    ...     print(record.query.id, record.query.description)
    ...
    Query_1 gi|195230749:301-1383 Homo sapiens wingless-type MMTV integration site family member 2 (WNT2), transcript variant 1, mRNA
    Query_2 gi|325053704:108-1166 Homo sapiens wingless-type MMTV integration site family, member 3A (WNT3A), mRNA
    Query_3 gi|156630997:105-1160 Homo sapiens wingless-type MMTV integration site family, member 4 (WNT4), mRNA
    Query_4 gi|371502086:108-1205 Homo sapiens wingless-type MMTV integration site family, member 5A (WNT5A), transcript variant 2, mRNA
    Query_5 gi|53729353:216-1313 Homo sapiens wingless-type MMTV integration site family, member 6 (WNT6), mRNA
    >>> stream.close()

    """
    records = Records(source)
    return records
def read(source):
    """Parse an XML file containing BLAST output for a single query and return it.

    This function calls Bio.Blast.parse, takes the first record from the
    resulting iterator, checks that no second record follows, and returns the
    record as a Bio.Blast.Record object. A ValueError is raised if the input
    holds no record, or more than one.

    The source is either the path to an XML file with BLAST output, or a file
    stream. A file stream must be in binary mode, so that the parser can detect
    the encoding from the XML file, and use it to convert any text in the XML
    to the correct Unicode string. The qblast function in Bio.Blast returns a
    file stream in binary mode. For files, please use mode "rb" when opening
    the file, as in

    >>> from Bio import Blast
    >>> stream = open("Blast/xml_2900_blastn_001.xml", "rb")  # opened in binary mode
    >>> record = Blast.read(stream)
    >>> record.query.id
    'G26684.1'
    >>> record.query.description
    'human STS STS_D11570, sequence tagged site'
    >>> len(record)
    10
    >>> stream.close()

    Use the Bio.Blast.parse function if you want to read a file containing
    BLAST output for more than one query.
    """
    with parse(source) as records:
        try:
            first = next(records)
        except StopIteration:
            raise ValueError("No BLAST output found.") from None
        # Exactly one record is expected, so a second one is an error.
        try:
            next(records)
        except StopIteration:
            return first
        raise ValueError("BLAST output for more than one query found.")
@function_with_previous
def qblast(
    program,
    database,
    sequence,
    url_base=NCBI_BLAST_URL,
    auto_format=None,
    composition_based_statistics=None,
    db_genetic_code=None,
    endpoints=None,
    entrez_query="(none)",
    expect=10.0,
    filter=None,
    gapcosts=None,
    genetic_code=None,
    hitlist_size=50,
    i_thresh=None,
    layout=None,
    lcase_mask=None,
    matrix_name=None,
    nucl_penalty=None,
    nucl_reward=None,
    other_advanced=None,
    perc_ident=None,
    phi_pattern=None,
    query_file=None,
    query_believe_defline=None,
    query_from=None,
    query_to=None,
    searchsp_eff=None,
    service=None,
    threshold=None,
    ungapped_alignment=None,
    word_size=None,
    short_query=None,
    alignments=500,
    alignment_view=None,
    descriptions=500,
    entrez_links_new_window=None,
    expect_low=None,
    expect_high=None,
    format_entrez_query=None,
    format_object=None,
    format_type="XML",
    ncbi_gi=None,
    results_file=None,
    show_overview=None,
    megablast=None,
    template_type=None,
    template_length=None,
    username="blast",
    password=None,
):
    """BLAST search using NCBI's QBLAST server or a cloud service provider.

    Supports all parameters of the old qblast API for Put and Get.

    Please note that NCBI uses the new Common URL API for BLAST searches
    on the internet (http://ncbi.github.io/blast-cloud/dev/api.html). Thus,
    some of the parameters used by this function are not (or are no longer)
    officially supported by NCBI. Although they are still functioning, this
    may change in the future.

    The Common URL API (http://ncbi.github.io/blast-cloud/dev/api.html) allows
    doing BLAST searches on cloud servers. To use this feature, please set
    ``url_base='http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi'``
    and ``format_object='Alignment'``. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "XML" (default), "HTML", "Text", "XML2", "JSON2",
                      or "Tabular".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
     - short_query    TRUE/FALSE whether to adjust the search parameters for a
                      short query sequence. Note that this will override
                      manually set parameters like word size and e value. Turns
                      off when sequence length is > 30 residues. Default: None.
     - service        plain, psi, phi, rpsblast, megablast (lower case)
     - username, password  If a password is given, HTTP basic authentication
                      for ``url_base`` is installed before any request is sent.

    This function does no checking of the validity of the parameters
    and passes the values to the server as is.  More help is available at:
    https://ncbi.github.io/blast-cloud/dev/api.html

    When ``url_base`` is the default NCBI URL, the module-level variables
    ``email`` and ``tool`` are sent along with the search request.

    Returns the response stream of the final "Get" request; the caller is
    responsible for closing it.

    Raises a ValueError if ``program`` is not one of the supported programs,
    or if no request identifier could be extracted from the server's reply.
    """
    programs = ["blastn", "blastp", "blastx", "tblastn", "tblastx"]
    if program not in programs:
        raise ValueError(
            f"Program specified is {program}. Expected one of {', '.join(programs)}"
        )

    # SHORT_QUERY_ADJUST throws an error when using blastn (wrong parameter
    # assignment from NCBIs side).
    # Thus we set the (known) parameters directly:
    if short_query and program == "blastn":
        short_query = None
        # We only use the 'short-query' parameters for short sequences:
        if len(sequence) < 31:
            expect = 1000
            word_size = 7
            nucl_reward = 1
            filter = None
            lcase_mask = None
            warnings.warn(
                '"SHORT_QUERY_ADJUST" is incorrectly implemented (by NCBI) for blastn.'
                " We bypass the problem by manually adjusting the search parameters."
                " Thus, results may slightly differ from web page searches.",
                BiopythonWarning,
            )

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = Blast.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = {
        "AUTO_FORMAT": auto_format,
        "COMPOSITION_BASED_STATISTICS": composition_based_statistics,
        "DATABASE": database,
        "DB_GENETIC_CODE": db_genetic_code,
        "ENDPOINTS": endpoints,
        "ENTREZ_QUERY": entrez_query,
        "EXPECT": expect,
        "FILTER": filter,
        "GAPCOSTS": gapcosts,
        "GENETIC_CODE": genetic_code,
        "HITLIST_SIZE": hitlist_size,
        "I_THRESH": i_thresh,
        "LAYOUT": layout,
        "LCASE_MASK": lcase_mask,
        "MEGABLAST": megablast,
        "MATRIX_NAME": matrix_name,
        "NUCL_PENALTY": nucl_penalty,
        "NUCL_REWARD": nucl_reward,
        "OTHER_ADVANCED": other_advanced,
        "PERC_IDENT": perc_ident,
        "PHI_PATTERN": phi_pattern,
        "PROGRAM": program,
        # ('PSSM': pssm: - It is possible to use PSI-BLAST via this API?
        "QUERY": sequence,
        "QUERY_FILE": query_file,
        "QUERY_BELIEVE_DEFLINE": query_believe_defline,
        "QUERY_FROM": query_from,
        "QUERY_TO": query_to,
        # 'RESULTS_FILE': ...: - Can we use this parameter?
        "SEARCHSP_EFF": searchsp_eff,
        "SERVICE": service,
        "SHORT_QUERY_ADJUST": short_query,
        "TEMPLATE_TYPE": template_type,
        "TEMPLATE_LENGTH": template_length,
        "THRESHOLD": threshold,
        "UNGAPPED_ALIGNMENT": ungapped_alignment,
        "WORD_SIZE": word_size,
        "CMD": "Put",
    }

    if password is not None:
        # handle authentication for BLAST cloud
        # Note that install_opener changes the opener used by every later
        # urlopen call in this process.
        password_mgr = HTTPPasswordMgrWithDefaultRealm()
        password_mgr.add_password(None, url_base, username, password)
        handler = HTTPBasicAuthHandler(password_mgr)
        opener = build_opener(handler)
        install_opener(opener)

    if url_base == NCBI_BLAST_URL:
        parameters.update({"email": email, "tool": tool})

    # Parameters that were not set are not sent to the server at all.
    parameters = {key: value for key, value in parameters.items() if value is not None}
    message = urlencode(parameters).encode()

    request = Request(url_base, message, {"User-Agent": "BiopythonClient"})
    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    # The reply is read completely by _parse_qblast_ref_page, so the
    # connection can be closed as soon as the request identifier is known.
    with urlopen(request) as stream:
        rid, _rtoe = _parse_qblast_ref_page(stream)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    parameters = {
        "ALIGNMENTS": alignments,
        "ALIGNMENT_VIEW": alignment_view,
        "DESCRIPTIONS": descriptions,
        "ENTREZ_LINKS_NEW_WINDOW": entrez_links_new_window,
        "EXPECT_LOW": expect_low,
        "EXPECT_HIGH": expect_high,
        "FORMAT_ENTREZ_QUERY": format_entrez_query,
        "FORMAT_OBJECT": format_object,
        "FORMAT_TYPE": format_type,
        "NCBI_GI": ncbi_gi,
        "RID": rid,
        "RESULTS_FILE": results_file,
        "SERVICE": service,
        "SHOW_OVERVIEW": show_overview,
        "CMD": "Get",
    }
    parameters = {key: value for key, value in parameters.items() if value is not None}
    message = urlencode(parameters).encode()

    # Poll NCBI until the results are ready.
    # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
    # 1. Do not contact the server more often than once every 10 seconds.
    # 2. Do not poll for any single RID more often than once a minute.
    # 3. Use the URL parameter email and tool, so that the NCBI
    #    can contact you if there is a problem.
    # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
    #    on weekdays if more than 50 searches will be submitted.
    # --
    # Could start with a 10s delay, but expect most short queries
    # will take longer thus at least 70s with delay. Therefore,
    # start with 20s delay, thereafter once a minute.
    delay = 20  # seconds
    while True:
        current = time.time()
        # qblast.previous holds the time of the previous poll, also across
        # separate calls of this function.
        wait = qblast.previous + delay - current
        if wait > 0:
            time.sleep(wait)
            qblast.previous = current + wait
        else:
            qblast.previous = current
        # delay by at least 60 seconds only if running the request against the public NCBI API
        if delay < 60 and url_base == NCBI_BLAST_URL:
            # Wasn't a quick return, must wait at least a minute
            delay = 60

        request = Request(url_base, message, {"User-Agent": "BiopythonClient"})
        stream = urlopen(request)
        # Look at the start of the reply without consuming it, so that the
        # caller still receives the complete data.
        data = stream.peek()
        if format_type == "HTML":
            # HTML results start with a DOCTYPE declaration themselves (see
            # the check below), so the DOCTYPE cannot be used to recognize
            # the 'please wait' page; only the title is tested.
            # NOTE(review): assumes the title is within the peeked data.
            still_running = b"<title>NCBI Blast:</title>" in data
        else:
            # For all other formats, an HTML page means 'please wait'.
            still_running = data.startswith(b"<!DOCTYPE html")
        if not still_running:
            break
        # Release the connection of the 'please wait' page before polling
        # again.
        stream.close()

    # Sanity checks on the start of the results for each format.
    if format_type == "XML":
        assert data.startswith(b"<?xml ")
    elif format_type == "HTML":
        assert data.startswith(b"<!DOCTYPE html ")
    elif format_type in ("Text", "Tabular"):
        assert data.startswith(b"<p><!--\nQBlastInfoBegin")
    elif format_type in ("XML2", "JSON2"):
        assert data.startswith(b"PK\x03\x04")  # zipped file
    return stream


# Time of the most recent poll; zero means that no poll has been made yet.
qblast.previous = 0
def _parse_qblast_ref_page(handle):
    """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).

    The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably
    'Request Time of Execution' and RID would be 'Request Identifier'.

    Arguments:
     - handle - a binary stream; it is read completely and decoded.

    Returns the tuple (rid, rtoe), with rid a string and rtoe an integer.

    Raises a ValueError if the RID or the RTOE cannot be found, or if the
    RTOE is not an integer. If neither is found, the error message of the
    server is included in the exception whenever it can be extracted.
    """
    s = handle.read().decode()

    def field(key):
        """Return the stripped text between ``key`` and the end of its line."""
        start = s.find(key)
        if start == -1:
            return None
        start += len(key)
        end = s.find("\n", start)
        if end == -1:
            # The value is on the last line, which has no trailing newline.
            end = len(s)
        return s[start:end].strip()

    rid = field("RID =")
    rtoe = field("RTOE =")

    if not rid and not rtoe:
        # Can we reliably extract the error message from the HTML page?
        # e.g.  "Message ID#24 Error: Failed to read the Blast query:
        #       Nucleotide FASTA provided for protein sequence"
        # or    "Message ID#32 Error: Query contains no data: Query
        #       contains no sequence data"
        #
        # This used to occur inside a <div class="error msInf"> entry;
        # in spring 2010 the markup was <p class="error"> instead.
        markups = (
            ('<div class="error msInf">', "</div>"),
            ('<p class="error">', "</p>"),
        )
        for opening, closing in markups:
            i = s.find(opening)
            if i != -1:
                msg = s[i + len(opening) :].strip()
                # Keep the first line of the text inside the element only.
                msg = msg.split(closing, 1)[0].split("\n", 1)[0].strip()
                if msg:
                    raise ValueError(f"Error message from NCBI: {msg}")
        # Generic search based on the way the error messages start:
        i = s.find("Message ID#")
        if i != -1:
            # Break the message at the first HTML tag
            msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip()
            raise ValueError(f"Error message from NCBI: {msg}")
        # We didn't recognise the error layout :(
        raise ValueError(
            "No RID and no RTOE found in the 'please wait' page, "
            "there was probably an error in your request but we "
            "could not extract a helpful error message."
        )
    elif not rid:
        # Can this happen?
        raise ValueError(
            f"No RID found in the 'please wait' page. (although RTOE = {rtoe!r})"
        )
    elif not rtoe:
        # Can this happen?
        raise ValueError(
            f"No RTOE found in the 'please wait' page. (although RID = {rid!r})"
        )

    try:
        return rid, int(rtoe)
    except ValueError:
        raise ValueError(
            f"A non-integer RTOE found in the 'please wait' page, {rtoe!r}"
        ) from None
if __name__ == "__main__":
    # Executed only when this file is run as a script; presumably runs the
    # doctests in the docstrings of this module (see Bio._utils.run_doctest).
    from Bio._utils import run_doctest

    run_doctest()

859
Bio/Blast/_parser.py Normal file
View File

@ -0,0 +1,859 @@
# Copyright 1999 by Jeffrey Chang. All rights reserved.
# Copyright 2000 by Bertrand Frottier. All rights reserved.
# Revisions 2005-2006 copyright Michiel de Hoon
# Revisions 2006-2009 copyright Peter Cock
# Revisions 2023 by Michiel de Hoon. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Code to parse BLAST XML output, and to parse the BLAST DTD file defining the XML.
The BLAST XML DTD file is available on the NCBI site at:
https://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd
"""
import os.path
from collections import deque
from xml.parsers import expat
from typing import Dict, Callable
from Bio.Blast import Record
from Bio.Seq import Seq, reverse_complement
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, SimpleLocation
from Bio.Align import Alignment, Alignments
from Bio import Entrez
class DTDHandler:
    """Parser for the BLAST XML DTD file.

    Creating an instance parses ``NCBI_BlastOutput.dtd`` and, for each element
    declared in it, stores the corresponding ``_start_*`` and ``_end_*``
    methods of ``XMLHandler`` in its ``_start_methods`` and ``_end_methods``
    dictionaries, using the element name as the key.
    """

    def __init__(self):
        """Initialize the parser and parse the BLAST XML DTD file."""
        self.parser = expat.ParserCreate()
        self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        self.parser.ExternalEntityRefHandler = self._externalEntityRefHandler
        # The DTD is loaded as if it were an external entity of the parser.
        self._externalEntityRefHandler(None, None, "NCBI_BlastOutput.dtd", None)

    def _elementDeclHandler(self, name, model):
        # "Hsp_bit-score" is handled by "_start_hsp_bit_score" and
        # "_end_hsp_bit_score", and so on.
        suffix = name.lower().replace("-", "_")
        registries = (
            ("_start_", XMLHandler._start_methods),
            ("_end_", XMLHandler._end_methods),
        )
        for prefix, registry in registries:
            registry[name] = getattr(XMLHandler, prefix + suffix)

    def _externalEntityRefHandler(self, context, base, systemId, publicId):
        assert context is None
        assert base is None
        # The DTD files are found in the DTDs directory of Bio.Entrez.
        dtd_path = os.path.join(Entrez.__path__[0], "DTDs", systemId)
        subparser = self.parser.ExternalEntityParserCreate(None)
        subparser.ElementDeclHandler = self._elementDeclHandler
        with open(dtd_path, "rb") as stream:
            subparser.ParseFile(stream)
        return 1
class XMLHandler:
"""Handler for BLAST XML data."""
_start_methods: Dict[str, Callable] = {}
_end_methods: Dict[str, Callable] = {}
def __init__(self, parser):
"""Initialize the expat parser."""
parser.XmlDeclHandler = self._xmlDeclHandler
parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
self._parser = parser
def _start_blastoutput(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_blastoutput_program(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_blastoutput_version(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_blastoutput_reference(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_blastoutput_db(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_blastoutput_query_id(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_blastoutput_query_def(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_blastoutput_mbstat(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_blastoutput_param(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_parameters(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
self._records.param = {}
def _start_parameters_matrix(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_parameters_expect(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_parameters_sc_match(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_parameters_sc_mismatch(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_parameters_include(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_parameters_gap_open(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_parameters_gap_extend(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_parameters_filter(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_parameters_pattern(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_parameters_entrez_query(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_blastoutput_iterations(self, name, attrs):
self._records._cache = deque()
assert self._characters.strip() == ""
self._characters = ""
def _start_blastoutput_query_len(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_blastoutput_query_seq(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_iteration(self, name, attrs):
    """Handle the ``Iteration`` start tag by starting a new, empty ``Record``."""
    self._record = Record()
def _start_iteration_iter_num(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_iteration_query_id(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_iteration_query_def(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_iteration_query_len(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_iteration_hits(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hit(self, name, attrs):
    """Handle the ``Hit`` start tag.

    Pending whitespace is discarded, and a new, empty ``Alignments`` object is
    stored as ``_alignment``.
    """
    assert not self._characters.strip()
    self._characters = ""
    self._alignment = Alignments()
def _start_hit_num(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hit_id(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hit_def(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hit_hsps(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hit_len(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hit_accession(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
self._hsp = {}
def _start_hsp_num(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_bit_score(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_score(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_evalue(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_query_from(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_query_to(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_hit_from(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_hit_to(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_pattern_from(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_pattern_to(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_query_frame(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_hit_frame(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_identity(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_positive(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_gaps(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_align_len(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_density(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_qseq(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_hseq(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_hsp_midline(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_iteration_stat(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_iteration_message(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_statistics(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
self._stat = {}
def _start_statistics_db_num(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_statistics_db_len(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_statistics_hsp_len(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_statistics_eff_space(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_statistics_kappa(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_statistics_lambda(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _start_statistics_entropy(self, name, attrs):
assert self._characters.strip() == ""
self._characters = ""
def _end_blastoutput(self, name):
assert self._characters.strip() == ""
parser = self._parser
parser.StartElementHandler = None
parser.EndElementHandler = None
parser.CharacterDataHandler = None
del self._characters
del self._records
del self._parser
def _end_blastoutput_program(self, name):
program = self._characters
self._program = program
self._records.program = program
self._characters = ""
def _end_blastoutput_version(self, name):
self._records.version = self._characters
self._characters = ""
def _end_blastoutput_reference(self, name):
self._records.reference = self._characters
self._characters = ""
def _end_blastoutput_db(self, name):
self._records.db = self._characters
self._characters = ""
def _end_blastoutput_query_id(self, name):
query_id = self._characters
self._records.query = SeqRecord(None, query_id)
self._characters = ""
def _end_blastoutput_query_def(self, name):
query_def = self._characters
self._records.query.description = query_def
self._characters = ""
def _end_blastoutput_query_len(self, name):
length = int(self._characters)
self._records.query.seq = Seq(None, length=length)
self._characters = ""
def _end_blastoutput_query_seq(self, name):
seq = Seq(self._characters)
self._characters = ""
assert len(seq) == len(self._records.query.seq)
self._records.query.seq = seq
def _end_blastoutput_mbstat(self, name):
assert self._characters.strip() == ""
self._characters = ""
self._records.mbstat = self._stat
del self._stat
def _end_blastoutput_param(self, name):
assert self._characters.strip() == ""
self._characters = ""
def _end_parameters(self, name):
assert self._characters.strip() == ""
self._characters = ""
def _end_parameters_matrix(self, name):
self._records.param["matrix"] = self._characters
self._characters = ""
def _end_parameters_expect(self, name):
self._records.param["expect"] = float(self._characters)
self._characters = ""
def _end_parameters_sc_match(self, name):
self._records.param["sc-match"] = int(self._characters)
self._characters = ""
def _end_parameters_sc_mismatch(self, name):
self._records.param["sc-mismatch"] = int(self._characters)
self._characters = ""
def _end_parameters_include(self, name):
self._records.param["include"] = float(self._characters)
self._characters = ""
def _end_parameters_gap_open(self, name):
self._records.param["gap-open"] = int(self._characters)
self._characters = ""
def _end_parameters_gap_extend(self, name):
self._records.param["gap-extend"] = int(self._characters)
self._characters = ""
def _end_parameters_filter(self, name):
self._records.param["filter"] = self._characters
self._characters = ""
def _end_parameters_pattern(self, name):
self._records.param["pattern"] = self._characters
self._characters = ""
def _end_parameters_entrez_query(self, name):
self._records.param["entrez-query"] = self._characters
self._characters = ""
def _end_blastoutput_iterations(self, name):
assert self._characters.strip() == ""
self._characters = ""
def _end_iteration(self, name):
assert self._characters.strip() == ""
self._characters = ""
self._records._cache.append(self._record)
del self._record
def _end_iteration_iter_num(self, name):
self._record.num = int(self._characters)
self._characters = ""
def _end_iteration_query_id(self, name):
query_id = self._characters
self._record.query = SeqRecord(None, query_id)
self._characters = ""
def _end_iteration_query_def(self, name):
query_def = self._characters
self._record.query.description = query_def
self._characters = ""
def _end_iteration_query_len(self, name):
length = int(self._characters)
self._record.query.seq = Seq(None, length=length)
self._characters = ""
def _end_iteration_hits(self, name):
assert self._characters.strip() == ""
self._characters = ""
def _end_hit(self, name):
assert self._characters.strip() == ""
self._characters = ""
alignment = self._alignment
del self._alignment
self._record.append(alignment)
def _end_hit_num(self, name):
num = int(self._characters)
if num != len(self._record) + 1:
raise ValueError(
f"unexpected value in tag <Hit_num> (found {num}, expected {len(self._record) + 1})"
)
self._characters = ""
def _end_hit_id(self, name):
hit_id = self._characters
self._alignment.target = SeqRecord(None, hit_id)
self._characters = ""
def _end_hit_def(self, name):
description = self._characters
self._alignment.target.description = description
self._characters = ""
def _end_hit_accession(self, name):
accession = self._characters
self._alignment.target.name = accession
self._characters = ""
def _end_hit_len(self, name):
length = int(self._characters)
self._alignment.target.seq = Seq(None, length=length)
self._characters = ""
def _end_hit_hsps(self, name):
assert self._characters.strip() == ""
self._characters = ""
def _end_hsp_num(self, name):
num = int(self._characters)
if num != len(self._alignment) + 1:
raise ValueError(
f"unexpected value in tag <Hsp_num> (found {num}, expected {len(self._alignment) + 1})"
)
self._characters = ""
def _end_hsp_bit_score(self, name):
self._hsp["bit-score"] = float(self._characters)
self._characters = ""
def _end_hsp_score(self, name):
self._hsp["score"] = float(self._characters)
self._characters = ""
def _end_hsp_evalue(self, name):
self._hsp["evalue"] = float(self._characters)
self._characters = ""
def _end_hsp_query_from(self, name):
self._hsp["query-from"] = int(self._characters)
self._characters = ""
def _end_hsp_query_to(self, name):
self._hsp["query-to"] = int(self._characters)
self._characters = ""
def _end_hsp_hit_from(self, name):
self._hsp["hit-from"] = int(self._characters)
self._characters = ""
def _end_hsp_hit_to(self, name):
self._hsp["hit-to"] = int(self._characters)
self._characters = ""
def _end_hsp_pattern_from(self, name):
self._hsp["pattern-from"] = int(self._characters)
self._characters = ""
def _end_hsp_pattern_to(self, name):
self._hsp["pattern-to"] = int(self._characters)
self._characters = ""
def _end_hsp_query_frame(self, name):
query_frame = int(self._characters)
if self._program in ("blastx", "tblastx") and query_frame in (
-3,
-2,
-1,
1,
2,
3,
):
pass
elif self._program == "blastp" and query_frame in (0, 1):
pass
elif self._program == "tblastn" and query_frame == 0:
pass
elif self._program in ("blastn", "megablast") and query_frame == 1:
pass
else:
raise ValueError(
f"unexpected value {query_frame} in tag <Hsp_query-frame> for program {self._program}"
)
self._hsp["query-frame"] = query_frame
self._characters = ""
def _end_hsp_hit_frame(self, name):
hit_frame = int(self._characters)
if self._program in "blastp" and hit_frame in (0, 1):
pass
elif self._program == "blastx" and hit_frame == 0:
pass
elif self._program in ("tblastn", "tblastx") and hit_frame in (
-3,
-2,
-1,
1,
2,
3,
):
pass
elif self._program in ("blastn", "megablast") and hit_frame in (-1, 1):
pass
else:
raise ValueError(
f"unexpected value {hit_frame} in tag <Hsp_hit-frame> for program {self._program}"
)
self._hsp["hit-frame"] = hit_frame
self._characters = ""
def _end_hsp_identity(self, name):
self._hsp["identity"] = int(self._characters)
self._characters = ""
def _end_hsp_positive(self, name):
self._hsp["positive"] = int(self._characters)
self._characters = ""
def _end_hsp_gaps(self, name):
self._hsp["gaps"] = int(self._characters)
self._characters = ""
def _end_hsp_align_len(self, name):
self._hsp["align-len"] = int(self._characters)
self._characters = ""
def _end_hsp_density(self, name):
self._hsp["density"] = int(self._characters)
self._characters = ""
def _end_hsp_qseq(self, name):
self._hsp["qseq"] = self._characters
self._characters = ""
def _end_hsp_hseq(self, name):
self._hsp["hseq"] = self._characters
self._characters = ""
def _end_hsp_midline(self, name):
self._hsp["midline"] = self._characters
self._characters = ""
def _end_hsp(self, name):
    """Build an Alignment from the collected HSP fields and store it.

    Consumes ``self._hsp`` (the dictionary filled by the ``_end_hsp_*``
    handlers), builds new query and target SeqRecord objects from the
    gap-stripped aligned sequences, combines them into an Alignment with a
    score and annotations, and appends that to ``self._alignment``.

    For the programs listed in the two ``if self._program in ...`` tests
    below, the corresponding record gets a "CDS" SeqFeature with a
    ``coded_by`` qualifier instead of a coordinate shift.

    Arguments:
     - name -- name of the closing tag (unused)

    NOTE(review): the indentation of this method was reconstructed from the
    parallel structure of the query and target halves; confirm the block
    nesting against the upstream file.
    """
    assert self._characters.strip() == ""
    self._characters = ""
    hsp = self._hsp
    del self._hsp
    align_len = hsp["align-len"]
    # Fall back to the records-level query when the current record has none.
    query = self._record.query
    if query is None:
        query = self._records.query
    query_id = query.id
    query_description = query.description
    query_length = len(query.seq)
    query_seq_aligned = hsp["qseq"]
    assert len(query_seq_aligned) == align_len
    target_seq_aligned = hsp["hseq"]
    assert len(target_seq_aligned) == align_len
    # Row 0 of the coordinates is the target, row 1 the query.
    coordinates = Alignment.infer_coordinates(
        [target_seq_aligned, query_seq_aligned]
    )
    query_seq_data = query_seq_aligned.replace("-", "")
    query_frame = hsp["query-frame"]
    query = SeqRecord(None, query_id, description=query_description)
    query_start = hsp["query-from"] - 1
    query_end = hsp["query-to"]
    if self._program in ("blastx", "tblastx"):
        assert query_end - query_start == 3 * len(query_seq_data)
        location = SimpleLocation(0, len(query_seq_data))
        coded_by = f"{query_id}:{hsp['query-from']}..{hsp['query-to']}"
        if query_frame > 0:
            assert query_start % 3 == query_frame - 1
        elif query_frame < 0:
            assert (query_length - query_end) % 3 == -query_frame - 1
            coded_by = f"complement({coded_by})"
        qualifiers = {"coded_by": coded_by}
        feature = SeqFeature(location, type="CDS", qualifiers=qualifiers)
        query.features.append(feature)
    else:
        coordinates[1, :] += query_start
        assert query_end - query_start == len(query_seq_data)
        query_seq_data = {query_start: query_seq_data}
    query.seq = Seq(query_seq_data, query_length)
    target = self._alignment.target
    target_id = target.id
    target_name = target.name
    target_description = target.description
    target_length = len(target.seq)
    target_seq_data = target_seq_aligned.replace("-", "")
    target_frame = hsp["hit-frame"]
    target = SeqRecord(None, target_id, target_name, description=target_description)
    if self._program in ("tblastn", "tblastx"):
        target_start = hsp["hit-from"] - 1
        target_end = hsp["hit-to"]
        assert target_end - target_start == 3 * len(target_seq_data)
        # NOTE(review): the query branch uses len(query_seq_data) as the end
        # of its feature location; here the full target length is used.
        # Left unchanged -- confirm which one is intended.
        location = SimpleLocation(0, target_length)
        coded_by = f"{target_id}:{hsp['hit-from']}..{hsp['hit-to']}"
        if target_frame > 0:
            assert target_start % 3 == target_frame - 1
        elif target_frame < 0:
            # This tested query_frame before; for tblastn the query frame is
            # 0, so reverse-strand hits never got the complement() qualifier.
            assert (target_length - target_end) % 3 == -target_frame - 1
            coded_by = f"complement({coded_by})"
        qualifiers = {"coded_by": coded_by}
        feature = SeqFeature(location, type="CDS", qualifiers=qualifiers)
        target.features.append(feature)
    else:
        if target_frame == +1 or target_frame == 0:
            target_start = hsp["hit-from"] - 1
            target_end = hsp["hit-to"]
            coordinates[0, :] += target_start
        elif target_frame == -1:
            # Reverse-strand hit: hit-from/hit-to are swapped, the sequence
            # is reverse-complemented and the coordinates are mirrored.
            target_start = hsp["hit-to"] - 1
            target_end = hsp["hit-from"]
            target_seq_data = reverse_complement(target_seq_data)
            coordinates[0, :] = target_end - coordinates[0, :]
        assert target_end - target_start == len(target_seq_data)
        target_seq_data = {target_start: target_seq_data}
    target.seq = Seq(target_seq_data, target_length)
    sequences = [target, query]
    alignment = Alignment(sequences, coordinates)
    alignment.score = hsp["score"]
    annotations = {}
    annotations["bit score"] = hsp["bit-score"]
    annotations["evalue"] = hsp["evalue"]
    annotations["identity"] = hsp["identity"]
    annotations["positive"] = hsp["positive"]
    try:
        annotations["gaps"] = hsp["gaps"]
    except KeyError:  # missing in megablast
        pass
    annotations["midline"] = hsp["midline"]
    alignment.annotations = annotations
    self._alignment.append(alignment)
def _end_iteration_stat(self, name):
assert self._characters.strip() == ""
self._characters = ""
self._record.stat = self._stat
del self._stat
def _end_iteration_message(self, name):
self._record.message = self._characters
self._characters = ""
def _end_statistics(self, name):
assert self._characters.strip() == ""
self._characters = ""
def _end_statistics_db_num(self, name):
self._stat["db-num"] = int(self._characters)
self._characters = ""
def _end_statistics_db_len(self, name):
self._stat["db-len"] = int(self._characters)
self._characters = ""
def _end_statistics_hsp_len(self, name):
self._stat["hsp-len"] = int(self._characters)
self._characters = ""
def _end_statistics_eff_space(self, name):
self._stat["eff-space"] = float(self._characters)
self._characters = ""
def _end_statistics_kappa(self, name):
self._stat["kappa"] = float(self._characters)
self._characters = ""
def _end_statistics_lambda(self, name):
self._stat["lambda"] = float(self._characters)
self._characters = ""
def _end_statistics_entropy(self, name):
self._stat["entropy"] = float(self._characters)
self._characters = ""
def _xmlDeclHandler(self, version, encoding, standalone):
parser = self._parser
parser.ExternalEntityRefHandler = self._externalEntityRefHandler
parser.StartElementHandler = self._startElementHandler
parser.EndElementHandler = self._endElementHandler
parser.CharacterDataHandler = self._characterDataHandler
self._characters = ""
parser.XmlDeclHandler = None
def _externalEntityRefHandler(self, context, base, systemId, publicId):
"""Handle the DTD declaration."""
assert context is None
assert base is None
if systemId not in (
"NCBI_BlastOutput.dtd",
"http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd",
):
raise ValueError("output from legacy BLAST program")
assert publicId == "-//NCBI//NCBI BlastOutput/EN"
self._parser.ExternalEntityRefHandler = None
return 1
def _startElementHandler(self, name, attr):
    """Found XML start tag.

    Looks up the handler registered for this tag in
    ``XMLHandler._start_methods`` and calls it; raises ValueError if no
    handler is registered.

    No real need of attr, BLAST DTD doesn't use them

    Arguments:
     - name -- name of the tag
     - attr -- tag attributes

    """
    handler = XMLHandler._start_methods.get(name)
    if handler is not None:
        handler(self, name, attr)
        return
    raise ValueError("Failed to find method for %s" % name)

def _endElementHandler(self, name):
    """Found XML end tag.

    Looks up the handler registered for this tag in
    ``XMLHandler._end_methods`` and calls it; raises ValueError if no
    handler is registered.

    Arguments:
     - name -- tag name

    """
    handler = XMLHandler._end_methods.get(name)
    if handler is not None:
        handler(self, name)
        return
    raise ValueError("Failed to find method for %s" % name)
def _characterDataHandler(self, characters):
"""Found some text.
Arguments:
- characters -- characters read
"""
self._characters += characters
def __iter__(self):
    """Return the handler itself as the iterator."""
    return self
def __repr__(self):
try:
stream = self._stream
except AttributeError:
stream = None
try:
parser = self._parser
except AttributeError:
parser = None
address = hex(id(self))
if stream is None and parser is None:
return f"<Bio.Blast._parser.XMLHandler object at {address} with no stream or parser>"
elif stream is None:
return f"<Bio.Blast._parser.XMLHandler object at {address} with parser {parser} and no stream>"
elif parser is None:
return f"<Bio.Blast._parser.XMLHandler object at {address} with stream {stream} and no parser>"
else:
return f"<Bio.Blast._parser.XMLHandler object at {address} with stream {stream} and parser {parser}>"
# Initialize XMLHandler by parsing the DTD.
# The DTDHandler instance itself is discarded; only the side effects of
# constructing it are used.
DTDHandler()

View File

@ -1,4 +1,4 @@
\chapter{BLAST}
\chapter{BLAST (new)}
\label{chapter:blast}
Hey, everybody loves BLAST right? I mean, geez, how can it get any easier to do comparisons between one of your sequences and every other sequence in the known world? But, of course, this section isn't about how cool BLAST is, since we already know that. It is about the problem with BLAST -- it can be really difficult to deal with the volume of data generated by large runs, and to automate BLAST runs in general.
@ -8,6 +8,726 @@ Dealing with BLAST can be split up into two steps, both of which can be done fro
Firstly, running BLAST for your query sequence(s), and getting some output.
Secondly, parsing the BLAST output in Python for further analysis.
Your first introduction to running BLAST was probably via the \href{https://blast.ncbi.nlm.nih.gov/Blast.cgi}{NCBI BLAST web page}.
In fact, there are lots of ways you can run BLAST, which can be categorized in several ways.
The most important distinction is running BLAST locally (on your own machine),
and running BLAST remotely (on another machine, typically the NCBI servers).
We're going to start this chapter by invoking the NCBI online BLAST service
from within a Python script.
\section{Running BLAST over the Internet}
\label{sec:running-www-blast}
We use the function \verb|qblast| in the \verb|Bio.Blast| module to call the
online version of BLAST.
The \href{https://blast.ncbi.nlm.nih.gov/doc/blast-help/developerinfo.html#developerinfo}{NCBI guidelines} state:
\begin{enumerate}
\item Do not contact the server more often than once every 10 seconds.
\item Do not poll for any single RID more often than once a minute.
\item Use the URL parameters email and tool, so that the NCBI can contact you if there is a problem.
\item Run scripts weekends or between 9 pm and 5 am Eastern time on weekdays if more than 50 searches will be submitted.
\end{enumerate}
\verb|Blast.qblast| follows the first two points automatically.
To fulfill the third point, set the \verb|Blast.email| variable (the
\verb|Blast.tool| variable is already set to \verb|"biopython"| by default):
%doctest
\begin{minted}{pycon}
>>> from Bio import Blast
>>> Blast.tool
'biopython'
>>> Blast.email = "A.N.Other@example.com"
\end{minted}
\subsection{BLAST arguments}
\label{subsec:blast-arguments}
The \verb|qblast| function has three non-optional arguments:
\begin{itemize}
\item The first argument is the BLAST program to use for the search, as a
lower case string. The programs and their options are described at the
\href{https://blast.ncbi.nlm.nih.gov/Blast.cgi}{NCBI BLAST web page}.
Currently \verb|qblast| only works with blastn, blastp, blastx, tblastn
and tblastx.
\item The second argument specifies the databases to search against. Again,
the options for this are available on \href{https://blast.ncbi.nlm.nih.gov/doc/blast-help/}{NCBI's BLAST Help pages}.
\item The third argument is a string containing your query sequence. This
can either be the sequence itself, the sequence in fasta format,
or an identifier like a GI number.
\end{itemize}
The \verb|qblast| function also takes a number of other optional arguments,
which are basically analogous to the different parameters you can set
on the BLAST web page. We'll just highlight a few of them here:
\begin{itemize}
\item The argument \verb|url_base| sets the base URL for running BLAST over the
internet. By default it connects to the NCBI, but one can use this to connect
to an instance of NCBI BLAST running in the cloud. Please refer to the documentation
for the \verb|qblast| function for further details.
\item The \verb|qblast| function can return the BLAST results in various
formats, which you can choose with the optional \verb|format_type| keyword:
\verb|"XML"|, \verb|"HTML"|, \verb|"Text"|, \verb|"XML2"|, \verb|"JSON2"|, or
\verb|"Tabular"|.
The default is \verb|"XML"|, as that is the format expected by the parser,
described in section~\ref{sec:parsing-blast} below.
\item The argument \verb|expect| sets the expectation or e-value threshold.
\end{itemize}
For more about the optional BLAST arguments, we refer you to the NCBI's own
documentation, or that built into Biopython:
\begin{minted}{pycon}
>>> from Bio import Blast
>>> help(Blast.qblast)
\end{minted}
Note that the default settings on the NCBI BLAST website are not quite
the same as the defaults on QBLAST. If you get different results, you'll
need to check the parameters (e.g., the expectation value threshold and
the gap values).
For example, if you have a nucleotide sequence you want to search against
the nucleotide database (nt) using BLASTN, and you know the GI number of your
query sequence, you can use:
\begin{minted}{pycon}
>>> from Bio import Blast
>>> result_stream = Blast.qblast("blastn", "nt", "8332116")
\end{minted}
Alternatively, if we have our query sequence already in a FASTA formatted
file, we just need to open the file and read in this record as a string,
and use that as the query argument:
\begin{minted}{pycon}
>>> from Bio import Blast
>>> fasta_string = open("m_cold.fasta").read()
>>> result_stream = Blast.qblast("blastn", "nt", fasta_string)
\end{minted}
We could also have read in the FASTA file as a \verb|SeqRecord| and then
supplied just the sequence itself:
\begin{minted}{pycon}
>>> from Bio import Blast
>>> from Bio import SeqIO
>>> record = SeqIO.read("m_cold.fasta", "fasta")
>>> result_stream = Blast.qblast("blastn", "nt", record.seq)
\end{minted}
Supplying just the sequence means that BLAST will assign an identifier
for your sequence automatically. You might prefer to call \verb|format|
on the \verb|SeqRecord| object to make a FASTA string
(which will include the existing identifier):
\begin{minted}{pycon}
>>> from Bio import Blast
>>> from Bio import SeqIO
>>> records = SeqIO.parse("ls_orchid.gbk", "genbank")
>>> record = next(records)
>>> result_stream = Blast.qblast("blastn", "nt", format(record, "fasta"))
\end{minted}
This approach makes more sense if you have your sequence(s) in a
non-FASTA file format which you can extract using \verb|Bio.SeqIO|
(see Chapter~\ref{chapter:seqio}).
\subsection{Saving BLAST results}
\label{subsec:saving-blast-results}
Whatever arguments you give the \verb|qblast()| function, you should
get back your results as a stream of \verb|bytes| data
(by default in XML format).
The next step would be to parse the XML output into Python objects
representing the search results (Section~\ref{sec:parsing-blast}),
but you might want to save a local copy of the output file first.
I find this especially useful when debugging my code that extracts
info from the BLAST results (because re-running the online search
is slow and wastes the NCBI computer time).
We need to be a bit careful since we can use \verb|result_stream.read()| to
read the BLAST output only once -- calling \verb|result_stream.read()| again
returns an empty \verb|bytes| object.
\begin{minted}{pycon}
>>> with open("my_blast.xml", "wb") as out_stream:
... out_stream.write(result_stream.read())
...
>>> result_stream.close()
\end{minted}
After doing this, the results are in the file \verb|my_blast.xml| and
\verb|result_stream| has had all its data extracted (so we closed it). However,
the \verb|parse| function of the BLAST parser (described
in~\ref{sec:parsing-blast}) takes a file-like object, so
we can just open the saved file for input as \verb|bytes|:
\begin{minted}{pycon}
>>> result_stream = open("my_blast.xml", "rb")
\end{minted}
Now that we've got the BLAST results back into a data stream again, we are ready
to do something with them, so this leads us right into the parsing section
(see Section~\ref{sec:parsing-blast} below). You may want to jump ahead to
that now \ldots.
\subsection{Obtaining BLAST output in other formats}
\label{subsec:blast-other-formats}
By using the \verb|format_type| argument when calling \verb|qblast|, you can obtain BLAST output in formats other than XML. Below is an example of reading BLAST output in JSON format. Using \verb|format_type="JSON2"|, the data provided by \verb|Blast.qblast| will be in zipped JSON format:
\begin{minted}{pycon}
>>> from Bio import Blast
>>> from Bio import SeqIO
>>> record = SeqIO.read("m_cold.fasta", "fasta")
>>> result_stream = Blast.qblast("blastn", "nt", record.seq, format_type="JSON2")
>>> data = result_stream.read()
>>> data[:4]
b'PK\x03\x04'
\end{minted}
which is the ZIP file magic number.
\begin{minted}{pycon}
>>> with open("myzipfile.zip", "wb") as out_stream:
... out_stream.write(data)
...
13813
\end{minted}
Note that we read and write the data as \verb|bytes|. Now open the ZIP file we created:
\begin{minted}{pycon}
>>> import zipfile
>>> myzipfile = zipfile.ZipFile("myzipfile.zip")
>>> myzipfile.namelist()
['N5KN7UMJ013.json', 'N5KN7UMJ013_1.json']
>>> stream = myzipfile.open("N5KN7UMJ013.json")
>>> data = stream.read()
\end{minted}
These data are \verb|bytes|, so we need to decode them to get a string object:
\begin{minted}{pycon}
>>> data = data.decode()
>>> print(data)
{
"BlastJSON": [
{"File": "N5KN7UMJ013_1.json" }
]
}
\end{minted}
Now open the second file contained in the ZIP file to get the BLAST results in JSON format:
\begin{minted}{pycon}
>>> stream = myzipfile.open("N5KN7UMJ013_1.json")
>>> data = stream.read()
>>> len(data)
145707
>>> data = data.decode()
>>> print(data)
{
"BlastOutput2": {
"report": {
"program": "blastn",
"version": "BLASTN 2.14.1+",
"reference": "Stephen F. Altschul, Thomas L. Madden, Alejandro A. ...
"search_target": {
"db": "nt"
},
"params": {
"expect": 10,
"sc_match": 2,
"sc_mismatch": -3,
"gap_open": 5,
"gap_extend": 2,
"filter": "L;m;"
},
"results": {
"search": {
"query_id": "Query_69183",
"query_len": 1111,
"query_masking": [
{
"from": 797,
"to": 1110
}
],
"hits": [
{
"num": 1,
"description": [
{
"id": "gi|1219041180|ref|XM_021875076.1|",
...
\end{minted}
We can use the JSON parser in Python's standard library to convert the JSON data into a regular Python dictionary:
\begin{minted}{pycon}
>>> import json
>>> d = json.loads(data)
>>> print(d)
{'BlastOutput2': {'report': {'program': 'blastn', 'version': 'BLASTN 2.14.1+',
'reference': 'Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&auml;ffer,
Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997),
"Gapped BLAST and PSI-BLAST: a new generation of protein database search programs",
Nucleic Acids Res. 25:3389-3402.',
'search_target': {'db': 'nt'}, 'params': {'expect': 10, 'sc_match': 2,
'sc_mismatch': -3, 'gap_open': 5, 'gap_extend': 2, 'filter': 'L;m;'},
'results': {'search': {'query_id': 'Query_128889', 'query_len': 1111,
'query_masking': [{'from': 797, 'to': 1110}], 'hits': [{'num': 1,
'description': [{'id': 'gi|1219041180|ref|XM_021875076.1|', 'accession':
'XM_021875076', 'title':
'PREDICTED: Chenopodium quinoa cold-regulated 413 plasma membrane protein 2-like (LOC110697660), mRNA',
'taxid': 63459, 'sciname': 'Chenopodium quinoa'}], 'len': 1173, 'hsps':
[{'num': 1, 'bit_score': 435.898, 'score': 482, 'evalue': 9.02832e-117,
'identity': 473, 'query_from'
...
\end{minted}
\section{Running BLAST locally}
\label{sec:running-local-blast}
\subsection{Introduction}
Running BLAST locally (as opposed to over the internet, see
Section~\ref{sec:running-www-blast}) has at least two major advantages:
\begin{itemize}
\item Local BLAST may be faster than BLAST over the internet;
\item Local BLAST allows you to make your own database to search for sequences against.
\end{itemize}
Dealing with proprietary or unpublished sequence data can be another reason to run BLAST
locally. You may not be allowed to redistribute the sequences, so submitting them to the
NCBI as a BLAST query would not be an option.
Unfortunately, there are some major drawbacks too -- installing all the bits and getting
it set up right takes some effort:
\begin{itemize}
\item Local BLAST requires command line tools to be installed.
\item Local BLAST requires (large) BLAST databases to be set up (and potentially kept up to date).
\end{itemize}
\subsection{Standalone NCBI BLAST+}
The ``new''
\href{https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download}
{NCBI BLAST+} suite was released in 2009. This replaces the old NCBI ``legacy'' BLAST
package (see \ref{subsec:other-blast-versions}).
This section will show briefly how to use these tools from within Python. If you have
already read or tried the alignment tool examples in Section~\ref{sec:alignment-tools}
this should all seem quite straightforward. First, we construct a command line string
(as you would type in at the command line prompt if running standalone BLAST by hand).
Then we can execute this command from within Python.
For example, taking a FASTA file of gene nucleotide sequences, you might want to
run a BLASTX (translation) search against the non-redundant (NR) protein database.
Assuming you (or your systems administrator) has downloaded and installed the NR
database, you might run:
\begin{minted}{console}
$ blastx -query opuntia.fasta -db nr -out opuntia.xml -evalue 0.001 -outfmt 5
\end{minted}
This should run BLASTX against the NR database, using an expectation cut-off value
of $0.001$ and produce XML output to the specified file (which we can then parse).
On my computer this takes about six minutes - a good reason to save the output
to a file so you can repeat any analysis as needed.
From within Python we can use the \verb|subprocess| module to build the command line
string, and run it:
\begin{minted}{pycon}
>>> import subprocess
>>> cmd = "blastx -query opuntia.fasta -db nr -out opuntia.xml"
>>> cmd += " -evalue 0.001 -outfmt 5"
>>> subprocess.run(cmd, shell=True)
\end{minted}
In this example there shouldn't be any output from BLASTX to the terminal. You may want
to check the output file \verb|opuntia.xml| has been created.
As you may recall from earlier examples in the tutorial, the \verb|opuntia.fasta|
contains seven sequences, so the BLAST XML output should contain multiple results.
Therefore use \verb|Bio.Blast.parse()| to parse it as described below in
Section~\ref{sec:parsing-blast}.
\subsection{Other versions of BLAST}
\label{subsec:other-blast-versions}
NCBI BLAST+ (written in C++) was first released in 2009 as a replacement for
the original NCBI ``legacy'' BLAST (written in C) which is no longer being updated.
You may also come across \href{http://blast.wustl.edu/}{Washington University BLAST}
(WU-BLAST), and its successor, \href{https://blast.advbiocomp.com}{Advanced Biocomputing
BLAST} (AB-BLAST, released in 2009, not free/open source). These packages include
the command line tools \verb|wu-blastall| and \verb|ab-blastall|, which mimicked
\verb|blastall| from the NCBI ``legacy'' BLAST suite.
Biopython does not currently provide wrappers for calling these tools, but should be able
to parse any NCBI compatible output from them.
\section{Parsing BLAST output}
\label{sec:parsing-blast}
As mentioned above, BLAST can generate output in various formats, such as
XML, HTML, and plain text. Originally, Biopython had parsers for BLAST
plain text and HTML output, as these were the only output formats offered
at the time. Unfortunately, the BLAST output in these formats kept changing,
each time breaking the Biopython parsers. Our HTML BLAST parser has been
removed, while the deprecated plain text BLAST parser is now only available
via \verb|Bio.SearchIO|. Use it at your own risk, it may or may not work,
depending on which BLAST version you're using.
As keeping up with changes in BLAST
became a hopeless endeavor, especially with users running different BLAST
versions, we now recommend parsing the output in XML format, which can be
generated by recent versions of BLAST. Not only is the XML output more stable
than the plain text and HTML output, it is also much easier to parse
automatically, making Biopython a whole lot more stable.
You can get BLAST output in XML format in various ways. For the parser, it
doesn't matter how the output was generated, as long as it is in the XML format.
\begin{itemize}
\item You can use Biopython to run BLAST over the internet, as described in
section~\ref{sec:running-www-blast}.
\item You can use Biopython to run BLAST locally, as described in
section~\ref{sec:running-local-blast}.
\item You can do the BLAST search yourself on the NCBI site through your
web browser, and then save the results. You need to choose XML as the format
in which to receive the results, and save the final BLAST page you get
(you know, the one with all of the interesting results!) to a file.
\item You can also run BLAST locally without using Biopython, and save
the output in a file. Again, you need to choose XML as the format in which
to receive the results.
\end{itemize}
The important point is that you do not have to use Biopython
scripts to fetch the data in order to be able to parse it.
Doing things in one of these ways, you then need to get a file-like object
to the results. In Python, a file-like object or handle is just a nice general way of describing input to any info source so that the info can be retrieved
using \verb|read()| and \verb|readline()| functions
(see Section~\ref{sec:appendix-handles}).
If you followed the code above for interacting with BLAST through a
script, then you already have \verb|result_stream|, the file-like object to the
BLAST results. For example, using a GI number to do an online search:
\begin{minted}{pycon}
>>> from Bio import Blast
>>> result_stream = Blast.qblast("blastn", "nt", "8332116")
\end{minted}
If instead you ran BLAST some other way, and have the
BLAST output (in XML format) in the file \verb|my_blast.xml|, all you
need to do is to open the file for reading (as \verb|bytes|):
\begin{minted}{pycon}
>>> result_stream = open("my_blast.xml", "rb")
\end{minted}
Now that we've got a data stream, we are ready to parse the output. The
code to parse it is really quite small. If you expect a single
BLAST result (i.e., you used a single query):
\begin{minted}{pycon}
>>> from Bio import Blast
>>> blast_record = Blast.read(result_stream)
\end{minted}
\noindent or, if you have lots of results (i.e., multiple query sequences):
\begin{minted}{pycon}
>>> from Bio import Blast
>>> blast_records = Blast.parse(result_stream)
\end{minted}
Just like \verb|Bio.SeqIO| and \verb|Bio.Align|
(see Chapters~\ref{chapter:seqio} and~\ref{chapter:align}),
we have a pair of input functions, \verb|read| and \verb|parse|, where
\verb|read| is for when you have exactly one object, and \verb|parse|
is an iterator for when you can have lots of objects -- but instead of
getting \verb|SeqRecord| or \verb|Alignment| objects, we
get BLAST record objects.
To be able to handle the situation where the BLAST file may be huge,
containing thousands of results, \verb|Blast.parse()| returns an
iterator. In plain English, an iterator allows you to step through
the BLAST output, retrieving BLAST records one by one for each BLAST
search result:
\begin{minted}{pycon}
>>> from Bio import Blast
>>> blast_records = Blast.parse(result_stream)
>>> blast_record = next(blast_records)
# ... do something with blast_record
>>> blast_record = next(blast_records)
# ... do something with blast_record
>>> blast_record = next(blast_records)
# ... do something with blast_record
>>> blast_record = next(blast_records)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
StopIteration
# No further records
\end{minted}
Or, you can use a \verb|for|-loop:
\begin{minted}{pycon}
>>> for blast_record in blast_records:
... pass # Do something with blast_record
...
\end{minted}
Note though that you can step through the BLAST records only once.
Usually, from each BLAST record you would save the information that
you are interested in. If you want to save all returned BLAST records,
you can convert the iterator into a list:
\begin{minted}{pycon}
>>> blast_records = list(blast_records)
\end{minted}
Now you can access each BLAST record in the list with an index as usual.
If your BLAST file is huge though, you may run into memory problems trying to
save them all in a list.
Instead of opening the file yourself, you can just provide the file name:
\begin{minted}{pycon}
>>> from Bio import Blast
>>> with Blast.parse("my_blast.xml") as blast_records:
... for blast_record in blast_records:
... pass # Do something with blast_record
...
\end{minted}
In this case, Biopython opens the file for you, and closes it as soon as the file is not needed any more (while it is possible to simply use \verb|blast_records = Blast.parse("my_blast.xml")|, it has the disadvantage that the file may stay open longer than strictly necessary, thereby wasting resources).
Usually, you'll be running one BLAST search at a time. Then, all you need
to do is to pick up the first (and only) BLAST record in \verb|blast_records|:
\begin{minted}{pycon}
>>> from Bio import Blast
>>> blast_records = Blast.parse(result_stream)
>>> blast_record = next(blast_records)
\end{minted}
\noindent or more elegantly:
\begin{minted}{pycon}
>>> from Bio import Blast
>>> blast_record = Blast.read(result_stream)
\end{minted}
or, equivalently,
\begin{minted}{pycon}
>>> from Bio import Blast
>>> blast_record = Blast.read("my_blast.xml")
\end{minted}
(here, you don't need to use a \verb|with| block as \verb|Blast.read| will read the whole file and close it immediately afterwards).
I guess by now you're wondering what is in a BLAST record.
\section{The BLAST Records and Record classes}
\subsection{The BLAST Records class}
\label{subsec:blast-records}
A single BLAST output file can contain output from multiple BLAST queries.
In Biopython, the information in a BLAST output file is stored in a \verb|Bio.Blast.Records| object. This is an iterator returning one \verb|Bio.Blast.Record| object (see subsection \ref{subsec:blast-record}) for each query. The \verb|Bio.Blast.Records| object has the following attributes describing the BLAST run:
\begin{itemize}
\item \verb|source|: The input data from which the \verb|Bio.Blast.Records| object was constructed (this could be a file name or path, or a file-like object).
\item \verb|program|: The specific BLAST program that was used (e.g., 'blastn').
\item \verb|version|: The version of the BLAST program (e.g., 'BLASTN 2.2.27+').
\item \verb|reference|: The literature reference to the BLAST publication.
\item \verb|db|: The BLAST database against which the query was run (e.g., 'nr').
\item \verb|query|: A \verb|SeqRecord| object which may contain some or all of the following information:
\begin{itemize}
\item \verb|query.id|: SeqId of the query;
\item \verb|query.description|: Definition line of the query;
\item \verb|query.seq|: The query sequence.
\end{itemize}
\item \verb|param|: A dictionary with the parameters used for the BLAST run. You may find the following keys in this dictionary:
\begin{itemize}
\item \verb|'matrix'|: the scoring matrix used in the BLAST run (e.g., 'BLOSUM62') (string);
\item \verb|'expect'|: threshold on the expected number of chance matches (float);
\item \verb|'include'|: e-value threshold for inclusion in multipass model in psiblast (float);
\item \verb|'sc-match'|: score for matching nucleotides (integer);
\item \verb|'sc-mismatch'|: score for mismatched nucleotides (integer);
\item \verb|'gap-open'|: gap opening cost (integer);
\item \verb|'gap-extend'|: gap extension cost (integer);
\item \verb|'filter'|: filtering options applied in the BLAST run (string);
\item \verb|'pattern'|: PHI-BLAST pattern (string);
\item \verb|'entrez-query'|: Limit of request to Entrez query (string).
\end{itemize}
\item \verb|mbstat|: A dictionary with Mega BLAST search statistics. See the description of the \verb|Record.stat| attribute below (in subsection \ref{subsec:blast-record}) for a description of the items in this dictionary. Only older versions of Mega BLAST store this information. As it is stored near the end of the BLAST output file, this attribute can only be accessed after the file has been read completely (by iterating over the records until a \verb|StopIteration| is issued).
\end{itemize}
For our example, we find:
\begin{minted}{pycon}
>>> blast_records.source
<_io.BufferedReader name='my_blast.xml'>
>>> blast_records.program
'blastn'
>>> blast_records.version
'BLASTN 2.2.27+'
>>> blast_records.reference
'Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.'
>>> blast_records.db
'refseq_rna'
>>> blast_records.param
{'expect': 10.0, 'sc-match': 2, 'sc-mismatch': -3, 'gap-open': 5, 'gap-extend': 2, 'filter': 'L;m;'}
\end{minted}
\subsection{The BLAST Record class}
\label{subsec:blast-record}
A \verb|Bio.Blast.Record| object stores the information provided by BLAST for a single query. The \verb|Bio.Blast.Record| class inherits from \verb+list+, and is essentially a list of \verb|Bio.Align.Alignments| objects (see Section~\ref{sec:alignments}). A \verb|Bio.Blast.Record| object has the following attributes:
\begin{itemize}
\item \verb|query|: A \verb|SeqRecord| object which may contain some or all of the following information:
\begin{itemize}
\item \verb|query.id|: SeqId of the query;
\item \verb|query.description|: Definition line of the query;
\item \verb|query.seq|: The query sequence.
\end{itemize}
\item \verb|stat|: A dictionary with statistical data of the BLAST hit. You may find the following keys in this dictionary:
\begin{itemize}
\item \verb|'db-num'|: number of sequences in BLAST db (integer);
\item \verb|'db-len'|: length of BLAST db (integer);
\item \verb|'hsp-len'|: effective HSP (High Scoring Pair) length (integer);
\item \verb|'eff-space'|: effective search space (float);
\item \verb|'kappa'|: Karlin-Altschul parameter K (float);
\item \verb|'lambda'|: Karlin-Altschul parameter Lambda (float);
\item \verb|'entropy'|: Karlin-Altschul parameter H (float).
\end{itemize}
\item \verb|message|: Some (error?) information.
\end{itemize}
Continuing with our example,
\begin{minted}{pycon}
>>> blast_record.query
SeqRecord(seq=Seq(None, length=61), id='42291', name='<unknown name>', description='mystery_seq', dbxrefs=[])
>>> blast_record.stat
{'db-num': 3056429, 'db-len': 673143725, 'hsp-len': 0, 'eff-space': 0.0, 'kappa': 0.41, 'lambda': 0.625, 'entropy': 0.78}
\end{minted}
Each \verb|Bio.Align.Alignments| object in the \verb|blast_record| list represents one BLAST hit of the query against a target:
\begin{minted}{pycon}
>>> type(blast_record[0])
<class 'Bio.Align.Alignments'>
>>> blast_record[0].target
SeqRecord(seq=Seq(None, length=61), id='gi|262205317|ref|NR_030195.1|', name='NR_030195', description='Homo sapiens microRNA 520b (MIR520B), microRNA', dbxrefs=[])
\end{minted}
Each hit is represented by a \verb|Bio.Align.Alignments| (plural) object, as the target and the query can align to each other in multiple but distinct regions.
However, typically a hit consists of only one or a few alignments, especially for alignments of highly homologous sequences. Each alignment is represented by a \verb|Bio.Align.Alignment| (singular) object (see Section \ref{sec:alignmentobject}).
\begin{minted}{pycon}
>>> len(blast_record[0])
1
>>> alignment = blast_record[0][0]
>>> type(alignment)
<class 'Bio.Align.Alignment'>
\end{minted}
The \verb|alignment| object has attributes pointing to the target and query sequences, as well as a \verb|coordinates| attribute describing the sequence alignment.
For translated BLAST searches, the \verb|features| attribute of the target or query may contain a \verb|SeqFeature| of type CDS that stores the amino acid sequence region. The \verb|qualifiers| attribute of such a feature is a dictionary with a single key \verb|'coded_by'|; the corresponding value specifies the nucleotide sequence region, in a GenBank-style string with 1-based coordinates, that encodes the amino acid sequence.
Each \verb|Alignment| object has the following additional attributes:
\begin{itemize}
\item \verb|score|: score of the High Scoring Pair (HSP);
\item \verb|annotations|: a dictionary that may contain the following keys:
\begin{itemize}
\item \verb|'bit score'|: score (in bits) of HSP (float);
\item \verb|'evalue'|: e-value of HSP (float);
\item \verb|'identity'|: number of identities in HSP (integer);
\item \verb|'positive'|: number of positives in HSP (integer);
\item \verb|'gaps'|: number of gaps in HSP (integer);
\item \verb|'midline'|: formatting middle line.
\end{itemize}
\end{itemize}
The usual \verb|Alignment| methods (see Section \ref{sec:alignmentobject}) can therefore be applied to \verb|alignment|. For example, we can print the alignment:
\begin{minted}{pycon}
>>> alignment.target
SeqRecord(seq=Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG'), id='gi|262205317|ref|NR_030195.1|', name='NR_030195', description='Homo sapiens microRNA 520b (MIR520B), microRNA', dbxrefs=[])
>>> alignment.query
SeqRecord(seq=Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG'), id='42291', name='<unknown name>', description='mystery_seq', dbxrefs=[])
>>> print(alignment)
gi|262205 0 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGG
0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
42291 0 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGG
gi|262205 60 G 61
60 | 61
42291 60 G 61
>>> alignment.coordinates
array([[ 0, 61],
[ 0, 61]])
\end{minted}
Let's just print out some summary info about all hits in our BLAST report
with an e-value below a particular threshold. The following code does this:
\begin{minted}{pycon}
>>> E_VALUE_THRESH = 0.04
>>> for alignments in blast_record:
... for alignment in alignments:
... if alignment.annotations["evalue"] < E_VALUE_THRESH:
... print("****Alignment****")
... print("sequence:", alignment.target.id, alignment.target.description)
... print("length:", len(alignment.target))
... print("score:", alignment.score)
... print("e value:", alignment.annotations["evalue"])
... print(alignment[:, :50])
...
\end{minted}
This will print out summary reports like the following:
\begin{minted}{text}
****Alignment****
sequence: gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 520b (MIR520B), microRNA
length: 61
score: 122.0
gi|262205 0 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTC 50
0 |||||||||||||||||||||||||||||||||||||||||||||||||| 50
42291 0 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTC 50
****Alignment****
sequence: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b (MIR520B), microRNA
length: 60
score: 120.0
gi|301171 0 CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCC 50
0 |||||||||||||||||||||||||||||||||||||||||||||||||| 50
42291 1 CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCC 51
****Alignment****
sequence: gi|270133242|ref|NR_032573.1| Macaca mulatta microRNA mir-519a (MIR519A), microRNA
length: 85
score: 112.0
gi|270133 12 CCCTCTAGAGGGAAGCGCTTTCTGTGGTCTGAAAGAAAAGAAAGTGCTTC 62
0 |||||||.|||||||||||||||||.|||||||||||||||||||||||| 50
42291 0 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTC 50
...
\end{minted}
\section{Dealing with PSI-BLAST}
You can run the standalone version of PSI-BLAST (\verb|psiblast|) directly
from the command line or using Python's \verb|subprocess| module.
At the time of writing, the NCBI do not appear to support tools running a
PSI-BLAST search via the internet.
Note that the \verb|Bio.Blast| parser can read the XML output from
current versions of PSI-BLAST, but information such as which sequences in each
iteration are new or reused isn't present in the XML file.
\section{Dealing with RPS-BLAST}
You can run the standalone version of RPS-BLAST (\verb|rpsblast|) directly
from the command line or using Python's \verb|subprocess| module.
At the time of writing, the NCBI do not appear to support tools running an
RPS-BLAST search via the internet.
You can use the \verb|Bio.Blast| parser to read the XML output from
current versions of RPS-BLAST.
\chapter{BLAST (old)}
\label{chapter:blast_old}
Hey, everybody loves BLAST right? I mean, geez, how can it get any easier to do comparisons between one of your sequences and every other sequence in the known world? But, of course, this section isn't about how cool BLAST is, since we already know that. It is about the problem with BLAST -- it can be really difficult to deal with the volume of data generated by large runs, and to automate BLAST runs in general.
Fortunately, the Biopython folks know this only too well, so they've developed lots of tools for dealing with BLAST and making things much easier. This section details how to use these tools and do useful things with them.
Dealing with BLAST can be split up into two steps, both of which can be done from within Biopython.
Firstly, running BLAST for your query sequence(s), and getting some output.
Secondly, parsing the BLAST output in Python for further analysis.
Your first introduction to running BLAST was probably via the NCBI web-service.
In fact, there are lots of ways you can run BLAST, which can be categorized in several ways.
The most important distinction is running BLAST locally (on your own machine),
@ -22,7 +742,6 @@ other related sequence searching tools as well. However, for now you can use
either that or the older \verb|Bio.Blast| module for dealing with NCBI BLAST.
\section{Running BLAST over the Internet}
\label{sec:running-www-blast}
We use the function \verb|qblast()| in the \verb|Bio.Blast.NCBIWWW| module
to call the online version of BLAST. This has three non-optional arguments:
@ -34,13 +753,13 @@ Currently \verb|qblast| only works with blastn, blastp, blastx, tblast
and tblastx.
\item The second argument specifies the databases to search against. Again,
the options for this are available on the NCBI Guide to BLAST
\url{ftp://ftp.ncbi.nlm.nih.gov/pub/factsheets/HowTo_BLASTGuide.pdf}.
\url{https://blast.ncbi.nlm.nih.gov/doc/blast-help/}.
\item The third argument is a string containing your query sequence. This
can either be the sequence itself, the sequence in fasta format,
or an identifier like a GI number.
\end{itemize}
NCBI guidelines, from \url{https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo} state:
The NCBI guidelines, from \url{https://blast.ncbi.nlm.nih.gov/doc/blast-help/developerinfo.html#developerinfo} state:
\begin{enumerate}
\item Do not contact the server more often than once every 10 seconds.
\item Do not poll for any single RID more often than once a minute.
@ -139,8 +858,6 @@ I find this especially useful when debugging my code that extracts
info from the BLAST results (because re-running the online search
is slow and wastes the NCBI computer time).
\label{sec:saving-blast-output}
We need to be a bit careful since we can use \verb|result_handle.read()| to
read the BLAST output only once -- calling \verb|result_handle.read()| again
returns an empty string.
@ -168,7 +885,6 @@ to do something with them, so this leads us right into the parsing section
that now \ldots.
\section{Running BLAST locally}
\label{sec:running-local-blast}
\subsection{Introduction}
@ -260,7 +976,6 @@ Biopython does not currently provide wrappers for calling these tools, but shoul
to parse any NCBI compatible output from them.
\section{Parsing BLAST output}
\label{sec:parsing-blast}
As mentioned above, BLAST can generate output in various formats, such as
XML, HTML, and plain text. Originally, Biopython had parsers for BLAST

9
Tests/Blast/broken1.xml Normal file
View File

@ -0,0 +1,9 @@
<?xml version="1.0"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">
<BlastOutput>
<BlastOutput_program>blastn</BlastOutput_program>
<BlastOutput_version>BLASTN 2.2.12 [Aug-07-2005]</BlastOutput_version>
<BlastOutput_reference>Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schäffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>
<BlastOutput_db>nr</BlastOutput_db>
<BlastOutput_query-ID>gi|1348916|gb|G26684.1|G26684</BlastOutput_query-ID>
<BlastOutput_query-def>human STS STS_D

30
Tests/Blast/broken2.xml Normal file
View File

@ -0,0 +1,30 @@
<?xml version="1.0"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">
<BlastOutput>
<BlastOutput_program>blastn</BlastOutput_program>
<BlastOutput_version>BLASTN 2.2.12 [Aug-07-2005]</BlastOutput_version>
<BlastOutput_reference>Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schäffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>
<BlastOutput_db>nr</BlastOutput_db>
<BlastOutput_query-ID>gi|1348916|gb|G26684.1|G26684</BlastOutput_query-ID>
<BlastOutput_query-def>human STS STS_D11570, sequence tagged site</BlastOutput_query-def>
<BlastOutput_query-len>285</BlastOutput_query-len>
<BlastOutput_param>
<Parameters>
<Parameters_expect>10</Parameters_expect>
<Parameters_sc-match>1</Parameters_sc-match>
<Parameters_sc-mismatch>-3</Parameters_sc-mismatch>
<Parameters_gap-open>5</Parameters_gap-open>
<Parameters_gap-extend>2</Parameters_gap-extend>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-ID>gi|1348916|gb|G26684.1|G26684</Iteration_query-ID>
<Iteration_query-def>human STS STS_D11570, sequence tagged site</Iteration_query-def>
<Iteration_query-len>285</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>gi|9950606|gb|AE004854.1|</Hit_id>
<Hit_def>Pseudomonas

76
Tests/Blast/broken3.xml Normal file
View File

@ -0,0 +1,76 @@
<?xml version="1.0"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">
<BlastOutput>
<BlastOutput_program>blastn</BlastOutput_program>
<BlastOutput_version>BLASTN 2.2.12 [Aug-07-2005]</BlastOutput_version>
<BlastOutput_reference>Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schäffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>
<BlastOutput_db>nr</BlastOutput_db>
<BlastOutput_query-ID>gi|1348916|gb|G26684.1|G26684</BlastOutput_query-ID>
<BlastOutput_query-def>human STS STS_D11570, sequence tagged site</BlastOutput_query-def>
<BlastOutput_query-len>285</BlastOutput_query-len>
<BlastOutput_param>
<Parameters>
<Parameters_expect>10</Parameters_expect>
<Parameters_sc-match>1</Parameters_sc-match>
<Parameters_sc-mismatch>-3</Parameters_sc-mismatch>
<Parameters_gap-open>5</Parameters_gap-open>
<Parameters_gap-extend>2</Parameters_gap-extend>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-ID>gi|1348916|gb|G26684.1|G26684</Iteration_query-ID>
<Iteration_query-def>human STS STS_D11570, sequence tagged site</Iteration_query-def>
<Iteration_query-len>285</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>gi|9950606|gb|AE004854.1|</Hit_id>
<Hit_def>Pseudomonas aeruginosa PAO1, section 415 of 529 of the complete genome</Hit_def>
<Hit_accession>AE004854</Hit_accession>
<Hit_len>11884</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>38.1576</Hsp_bit-score>
<Hsp_score>19</Hsp_score>
<Hsp_evalue>1.0598</Hsp_evalue>
<Hsp_query-from>68</Hsp_query-from>
<Hsp_query-to>86</Hsp_query-to>
<Hsp_hit-from>6012</Hsp_hit-from>
<Hsp_hit-to>6030</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>1</Hsp_hit-frame>
<Hsp_identity>19</Hsp_identity>
<Hsp_positive>19</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>19</Hsp_align-len>
<Hsp_qseq>CAGGCCAGCGACTTCTGGG</Hsp_qseq>
<Hsp_hseq>CAGGCCAGCGACTTCTGGG</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>2</Hit_num>
<Hit_id>gi|15073988|emb|AL591786.1|SME591786</Hit_id>
<Hit_def>Sinorhizobium meliloti 1021 complete chromosome; segment 5/12</Hit_def>
<Hit_accession>AL591786</Hit_accession>
<Hit_len>299350</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>36.1753</Hsp_bit-score>
<Hsp_score>18</Hsp_score>
<Hsp_evalue>4.18768</Hsp_evalue>
<Hsp_query-from>204</Hsp_query-from>
<Hsp_query-to>224</Hsp_query-to>
<Hsp_hit-from>83648</Hsp_hit-from>
<Hsp_hit-to>83628</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>20</Hsp_identity>
<Hsp_positive>20</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align

95
Tests/Blast/broken4.xml Normal file
View File

@ -0,0 +1,95 @@
<?xml version="1.0"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">
<BlastOutput>
<BlastOutput_program>blastn</BlastOutput_program>
<BlastOutput_version>BLASTN 2.2.12 [Aug-07-2005]</BlastOutput_version>
<BlastOutput_reference>Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schäffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>
<BlastOutput_db>nr</BlastOutput_db>
<BlastOutput_query-ID>gi|1348916|gb|G26684.1|G26684</BlastOutput_query-ID>
<BlastOutput_query-def>human STS STS_D11570, sequence tagged site</BlastOutput_query-def>
<BlastOutput_query-len>285</BlastOutput_query-len>
<BlastOutput_param>
<Parameters>
<Parameters_expect>10</Parameters_expect>
<Parameters_sc-match>1</Parameters_sc-match>
<Parameters_sc-mismatch>-3</Parameters_sc-mismatch>
<Parameters_gap-open>5</Parameters_gap-open>
<Parameters_gap-extend>2</Parameters_gap-extend>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-ID>gi|1348916|gb|G26684.1|G26684</Iteration_query-ID>
<Iteration_query-def>human STS STS_D11570, sequence tagged site</Iteration_query-def>
<Iteration_query-len>285</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>gi|9950606|gb|AE004854.1|</Hit_id>
<Hit_def>Pseudomonas aeruginosa PAO1, section 415 of 529 of the complete genome</Hit_def>
<Hit_accession>AE004854</Hit_accession>
<Hit_len>11884</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>38.1576</Hsp_bit-score>
<Hsp_score>19</Hsp_score>
<Hsp_evalue>1.0598</Hsp_evalue>
<Hsp_query-from>68</Hsp_query-from>
<Hsp_query-to>86</Hsp_query-to>
<Hsp_hit-from>6012</Hsp_hit-from>
<Hsp_hit-to>6030</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>1</Hsp_hit-frame>
<Hsp_identity>19</Hsp_identity>
<Hsp_positive>19</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>19</Hsp_align-len>
<Hsp_qseq>CAGGCCAGCGACTTCTGGG</Hsp_qseq>
<Hsp_hseq>CAGGCCAGCGACTTCTGGG</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>2</Hit_num>
<Hit_id>gi|15073988|emb|AL591786.1|SME591786</Hit_id>
<Hit_def>Sinorhizobium meliloti 1021 complete chromosome; segment 5/12</Hit_def>
<Hit_accession>AL591786</Hit_accession>
<Hit_len>299350</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>36.1753</Hsp_bit-score>
<Hsp_score>18</Hsp_score>
<Hsp_evalue>4.18768</Hsp_evalue>
<Hsp_query-from>204</Hsp_query-from>
<Hsp_query-to>224</Hsp_query-to>
<Hsp_hit-from>83648</Hsp_hit-from>
<Hsp_hit-to>83628</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>20</Hsp_identity>
<Hsp_positive>20</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>21</Hsp_align-len>
<Hsp_qseq>TGAAAGGAAATNAAAATGGAA</Hsp_qseq>
<Hsp_hseq>TGAAAGGAAATCAAAATGGAA</Hsp_hseq>
<Hsp_midline>||||||||||| |||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>371021</Statistics_db-num>
<Statistics_db-len>1233631384</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
<Statistics_kappa>0.710603</Statistics_kappa>
<Statistics_lambda>1.37406</Statistics_lambda>
<Statistics_entropy>1.30725</Statistics_entropy>
</Statistics>
</Iteration_stat>
</Iteration>

36
Tests/Blast/broken5.xml Normal file
View File

@ -0,0 +1,36 @@
<?xml version="1.0"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
<BlastOutput>
<BlastOutput_program>blastp</BlastOutput_program>
<BlastOutput_version>blastp 2.2.18 [Mar-02-2008]</BlastOutput_version>
<BlastOutput_reference>~Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaffer, ~Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), ~&quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search~programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>
<BlastOutput_db>/Users/pjcock/Downloads/Software/blast-2.2.18/data/nr</BlastOutput_db>
<BlastOutput_query-ID>lcl|1_0</BlastOutput_query-ID>
<BlastOutput_query-def
<BlastOutput_query-len>9</BlastOutput_query-len>
<BlastOutput_param>
<Parameters>
<Parameters_matrix>BLOSUM62</Parameters_matrix>
<Parameters_expect>1e-05</Parameters_expect>
<Parameters_gap-open>11</Parameters_gap-open>
<Parameters_gap-extend>1</Parameters_gap-extend>
<Parameters_filter>F</Parameters_filter>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_stat>
<Statistics>
<Statistics_db-num>6589360</Statistics_db-num>
<Statistics_db-len>2253133281</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>2.02782e+10</Statistics_eff-space>
<Statistics_kappa>0.041</Statistics_kappa>
<Statistics_lambda>0.267</Statistics_lambda>
<Statistics_entropy>0.14</Statistics_entropy>
</Statistics>
</Iteration_stat>
</Iteration>
</BlastOutput_iterations>
</BlastOutput>

View File

@ -0,0 +1,69 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
<BlastOutput>
<BlastOutput_program>megablast</BlastOutput_program>
<BlastOutput_version>megablast 2.2.26 [Sep-21-2011]</BlastOutput_version>
<BlastOutput_reference>~Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaffer, ~Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), ~&quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search~programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>
<BlastOutput_db>m_cold.fasta</BlastOutput_db>
<BlastOutput_query-ID>lcl|1_</BlastOutput_query-ID>
<BlastOutput_query-def>gi|8332116|gb|BE037100.1|BE037100 MP14H09 MP Mesembryanthemum crystallinum cDNA 5&apos; similar to cold acclimation protein, mRNA sequence</BlastOutput_query-def>
<BlastOutput_query-len>1111</BlastOutput_query-len>
<BlastOutput_param>
<Parameters>
<Parameters_expect>10</Parameters_expect>
<Parameters_sc-match>1</Parameters_sc-match>
<Parameters_sc-mismatch>-3</Parameters_sc-mismatch>
<Parameters_gap-open>0</Parameters_gap-open>
<Parameters_gap-extend>0</Parameters_gap-extend>
<Parameters_filter>D</Parameters_filter>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>0</Iteration_iter-num>
<Iteration_query-ID>lcl|1_</Iteration_query-ID>
<Iteration_query-def>gi|8332116|gb|BE037100.1|BE037100 MP14H09 MP Mesembryanthemum crystallinum cDNA 5&apos; similar to cold acclimation protein, mRNA sequence</Iteration_query-def>
<Iteration_query-len>1111</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>gnl|BL_ORD_ID|0</Hit_id>
<Hit_def>gi|8332116|gb|BE037100.1|BE037100 MP14H09 MP Mesembryanthemum crystallinum cDNA 5&apos; similar to cold acclimation protein, mRNA sequence</Hit_def>
<Hit_accession>0</Hit_accession>
<Hit_len>1111</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1562.59</Hsp_bit-score>
<Hsp_score>788</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>797</Hsp_query-to>
<Hsp_hit-from>1</Hsp_hit-from>
<Hsp_hit-to>797</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>1</Hsp_hit-frame>
<Hsp_identity>797</Hsp_identity>
<Hsp_positive>797</Hsp_positive>
<Hsp_align-len>797</Hsp_align-len>
<Hsp_qseq>CACTAGTACTCGAGCGTNCTGCACCAATTCGGCACGAGCAAGTGACTACGTTNTGTGAACAGAAAATGGGGAGAGAAATGAAGTACTTGGCCATGAAAACTGATCAATTGGCCGTGGCTAATATGATCGATTCCGATATCAATGAGCTTAAAATGGCAACAATGAGGCTCATCAATGATGCTAGTATGCTCGGTCATTACGGGTTTGGCACTCATTTCCTCAAATGGCTCGCCTGCCTTGCGGCTATTTACTTGTTGATATTGGATCGAACAAACTGGAGAACCAACATGCTCACGTCACTTTTAGTCCCTTACATATTCCTCAGTCTTCCATCCGGGCCATTTCATCTGTTCAGAGGCGAGGTCGGGAAATGGATTGCCATCATTGCAGTCGTGTTAAGGCTGTTCTTCAACCGGCATTTCCCAGTTTGGCTGGAAATGCCTGGATCGTTGATACTCCTCCTGGTGGTGGCACCAGACTTCTTTACACACAAAGTGAAGGAGAGCTGGATCGGAATTGCAATTATGATAGCGATAGGGTGTCACCTGATGCAAGAACATATCAGAGCCACTGGTGGCTTTTGGAATTCCTTCACACAGAGCCACGGAACTTTTAACACAATTGGGCTTATCCTTCTACTGGCTTACCCTGTCTGTTTATGGTCATCTTCATGATGTAGTAGCTTAGTCTTGATCCTAATCCTCAAATNTACTTTTCCAGCTCTTTCGACGCTCTTGCTAAAGCCCATTCAATTCGCCCCATATTTCGCACACATTCATTTCACCACCCAATACGTG</Hsp_qseq>
<Hsp_hseq>CACTAGTACTCGAGCGTNCTGCACCAATTCGGCACGAGCAAGTGACTACGTTNTGTGAACAGAAAATGGGGAGAGAAATGAAGTACTTGGCCATGAAAACTGATCAATTGGCCGTGGCTAATATGATCGATTCCGATATCAATGAGCTTAAAATGGCAACAATGAGGCTCATCAATGATGCTAGTATGCTCGGTCATTACGGGTTTGGCACTCATTTCCTCAAATGGCTCGCCTGCCTTGCGGCTATTTACTTGTTGATATTGGATCGAACAAACTGGAGAACCAACATGCTCACGTCACTTTTAGTCCCTTACATATTCCTCAGTCTTCCATCCGGGCCATTTCATCTGTTCAGAGGCGAGGTCGGGAAATGGATTGCCATCATTGCAGTCGTGTTAAGGCTGTTCTTCAACCGGCATTTCCCAGTTTGGCTGGAAATGCCTGGATCGTTGATACTCCTCCTGGTGGTGGCACCAGACTTCTTTACACACAAAGTGAAGGAGAGCTGGATCGGAATTGCAATTATGATAGCGATAGGGTGTCACCTGATGCAAGAACATATCAGAGCCACTGGTGGCTTTTGGAATTCCTTCACACAGAGCCACGGAACTTTTAACACAATTGGGCTTATCCTTCTACTGGCTTACCCTGTCTGTTTATGGTCATCTTCATGATGTAGTAGCTTAGTCTTGATCCTAATCCTCAAATNTACTTTTCCAGCTCTTTCGACGCTCTTGCTAAAGCCCATTCAATTCGCCCCATATTTCGCACACATTCATTTCACCACCCAATACGTG</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
</Iteration_hits>
</Iteration>
</BlastOutput_iterations>
<BlastOutput_mbstat>
<Statistics>
<Statistics_db-num>1</Statistics_db-num>
<Statistics_db-len>1111</Statistics_db-len>
<Statistics_hsp-len>10</Statistics_hsp-len>
<Statistics_eff-space>1.2122e+06</Statistics_eff-space>
<Statistics_kappa>0.710603</Statistics_kappa>
<Statistics_lambda>1.37406</Statistics_lambda>
<Statistics_entropy>1.30725</Statistics_entropy>
</Statistics>
</BlastOutput_mbstat>
</BlastOutput>

322
Tests/Blast/phiblast.xml Normal file
View File

@ -0,0 +1,322 @@
<?xml version="1.0"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
<BlastOutput>
<BlastOutput_program>blastp</BlastOutput_program>
<BlastOutput_version>BLASTP 2.14.1+</BlastOutput_version>
<BlastOutput_reference>Zheng Zhang, Alejandro A. Sch&amp;auml;ffer, Webb Miller, Thomas L. Madden, David J. Lipman, Eugene V. Koonin, and Stephen F. Altschul (1998), &quot;Protein sequence similarity searches using patterns as seeds&quot;, Nucleic Acids Res. 26:3986-3990.</BlastOutput_reference>
<BlastOutput_db>nr</BlastOutput_db>
<BlastOutput_query-ID>Query_74414</BlastOutput_query-ID>
<BlastOutput_query-def>unnamed protein product</BlastOutput_query-def>
<BlastOutput_query-len>664</BlastOutput_query-len>
<BlastOutput_param>
<Parameters>
<Parameters_matrix>BLOSUM62</Parameters_matrix>
<Parameters_expect>0.05</Parameters_expect>
<Parameters_gap-open>11</Parameters_gap-open>
<Parameters_gap-extend>1</Parameters_gap-extend>
<Parameters_filter>F</Parameters_filter>
<Parameters_pattern>[LIVMF]-G-E-x-[GAS]-[LIVM]-x(5,11)-R-[STAQ]-A-x-[LIVMA]-x-[STACV]</Parameters_pattern>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-ID>Query_74414</Iteration_query-ID>
<Iteration_query-def>unnamed protein product</Iteration_query-def>
<Iteration_query-len>664</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>ref|NP_001075863.1|</Hit_id>
<Hit_def>cyclic nucleotide-gated olfactory channel [Oryctolagus cuniculus] &gt;emb|CAA42201.1| aorta CNG channel (rACNG) [Oryctolagus cuniculus] &gt;prf||1919268A cyclic nucleotide-gated channel [Oryctolagus cuniculus]</Hit_def>
<Hit_accession>NP_001075863</Hit_accession>
<Hit_len>732</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1290.65</Hsp_bit-score>
<Hsp_score>3336</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>664</Hsp_query-to>
<Hsp_hit-from>69</Hsp_hit-from>
<Hsp_hit-to>732</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_hit-frame>0</Hsp_hit-frame>
<Hsp_identity>664</Hsp_identity>
<Hsp_positive>664</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>664</Hsp_align-len>
<Hsp_qseq>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_qseq>
<Hsp_hseq>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_hseq>
<Hsp_midline>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>2</Hit_num>
<Hit_id>ref|XP_051689802.1|</Hit_id>
<Hit_def>cyclic nucleotide-gated olfactory channel isoform X3 [Oryctolagus cuniculus] &gt;sp|Q28718.1| RecName: Full=Cyclic nucleotide-gated olfactory channel; AltName: Full=Aorta CNG channel; Short=RACNG; AltName: Full=Cyclic nucleotide-gated cation channel 2; AltName: Full=Cyclic nucleotide-gated channel alpha-2; Short=CNG channel alpha-2; Short=CNG-2; Short=CNG2 [Oryctolagus cuniculus]</Hit_def>
<Hit_accession>XP_051689802</Hit_accession>
<Hit_len>664</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1290.65</Hsp_bit-score>
<Hsp_score>3336</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>664</Hsp_query-to>
<Hsp_hit-from>1</Hsp_hit-from>
<Hsp_hit-to>664</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_hit-frame>0</Hsp_hit-frame>
<Hsp_identity>664</Hsp_identity>
<Hsp_positive>664</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>664</Hsp_align-len>
<Hsp_qseq>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_qseq>
<Hsp_hseq>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_hseq>
<Hsp_midline>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>3</Hit_num>
<Hit_id>ref|XP_017206345.1|</Hit_id>
<Hit_def>cyclic nucleotide-gated olfactory channel isoform X2 [Oryctolagus cuniculus]</Hit_def>
<Hit_accession>XP_017206345</Hit_accession>
<Hit_len>677</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1290.65</Hsp_bit-score>
<Hsp_score>3336</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>664</Hsp_query-to>
<Hsp_hit-from>14</Hsp_hit-from>
<Hsp_hit-to>677</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_hit-frame>0</Hsp_hit-frame>
<Hsp_identity>664</Hsp_identity>
<Hsp_positive>664</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>664</Hsp_align-len>
<Hsp_qseq>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_qseq>
<Hsp_hseq>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_hseq>
<Hsp_midline>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>4</Hit_num>
<Hit_id>ref|XP_051689801.1|</Hit_id>
<Hit_def>cyclic nucleotide-gated olfactory channel isoform X1 [Oryctolagus cuniculus]</Hit_def>
<Hit_accession>XP_051689801</Hit_accession>
<Hit_len>687</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1290.65</Hsp_bit-score>
<Hsp_score>3336</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>664</Hsp_query-to>
<Hsp_hit-from>24</Hsp_hit-from>
<Hsp_hit-to>687</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_hit-frame>0</Hsp_hit-frame>
<Hsp_identity>664</Hsp_identity>
<Hsp_positive>664</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>664</Hsp_align-len>
<Hsp_qseq>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_qseq>
<Hsp_hseq>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_hseq>
<Hsp_midline>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>5</Hit_num>
<Hit_id>ref|XP_004407164.1|</Hit_id>
<Hit_def>PREDICTED: cyclic nucleotide-gated olfactory channel [Odobenus rosmarus divergens]</Hit_def>
<Hit_accession>XP_004407164</Hit_accession>
<Hit_len>664</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1249.79</Hsp_bit-score>
<Hsp_score>3231</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>664</Hsp_query-to>
<Hsp_hit-from>1</Hsp_hit-from>
<Hsp_hit-to>664</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_hit-frame>0</Hsp_hit-frame>
<Hsp_identity>639</Hsp_identity>
<Hsp_positive>652</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>664</Hsp_align-len>
<Hsp_qseq>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_qseq>
<Hsp_hseq>MTEKSNGVKSSPANNHNHHTPPAIKANGKDDHRTNSRPQSAADDDTSSELQRLAEMDAPQQGRGGFRRIVRLVGIIREWANKNFREEEPRPDSFLERFRGPELQTVTTQQGDGKGDKDGEGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYYLVWLVLDYFSDVVYITDLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHSPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKSVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLEQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLETKMKQNNMDDYLSDGVNSPEPTAADEP</Hsp_hseq>
<Hsp_midline>MTEKSNGVKSSPANNHN+H P IKANGKD+ RT SRPQSAADDDTSSELQRLAEMDAPQQ RGGFRRIVRLVG+IR+WAN+NFREEE RPDSFLERFRGPELQTVTTQQGDGKGDKDG+GKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGY+LVWLVLDYFSDVVYI DLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIH+PELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKK+VDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKL+QLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLE KMKQN DDYLSDG+NSPEP AA++P</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>6</Hit_num>
<Hit_id>ref|XP_008688471.1|</Hit_id>
<Hit_def>cyclic nucleotide-gated olfactory channel [Ursus maritimus] &gt;ref|XP_026343324.1| cyclic nucleotide-gated olfactory channel [Ursus arctos]</Hit_def>
<Hit_accession>XP_008688471</Hit_accession>
<Hit_len>664</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1248.63</Hsp_bit-score>
<Hsp_score>3228</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>664</Hsp_query-to>
<Hsp_hit-from>1</Hsp_hit-from>
<Hsp_hit-to>664</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_hit-frame>0</Hsp_hit-frame>
<Hsp_identity>638</Hsp_identity>
<Hsp_positive>652</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>664</Hsp_align-len>
<Hsp_qseq>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_qseq>
<Hsp_hseq>MTEKSNGVKSSPANNHNHHAPPAIKANGKDDHRSSSRPQSAVDDDTSSELQRLAEMDAPQRGRGGFRRIVRLVGIIRDWANKNFREEEPRPDSFLERFRGPELQTVTTQQGDGKGDKDGEGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQKGYYLVWLVLDYFSDVVYITDLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHSPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKSVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLEQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLETKMKQNNEDDYLSDGMNSPEPAAADEP</Hsp_hseq>
<Hsp_midline>MTEKSNGVKSSPANNHN+H P IKANGKD+ R+ SRPQSA DDDTSSELQRLAEMDAPQ+ RGGFRRIVRLVG+IR WAN+NFREEE RPDSFLERFRGPELQTVTTQQGDGKGDKDG+GKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQ+GY+LVWLVLDYFSDVVYI DLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIH+PELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKK+VDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKL+QLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLE KMKQN EDDYLSDGMNSPEPAAA++P</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>7</Hit_num>
<Hit_id>ref|XP_011229794.1|</Hit_id>
<Hit_def>cyclic nucleotide-gated olfactory channel [Ailuropoda melanoleuca] &gt;gb|EFB14215.1| hypothetical protein PANDA_013994, partial [Ailuropoda melanoleuca]</Hit_def>
<Hit_accession>XP_011229794</Hit_accession>
<Hit_len>664</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1248.24</Hsp_bit-score>
<Hsp_score>3227</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>664</Hsp_query-to>
<Hsp_hit-from>1</Hsp_hit-from>
<Hsp_hit-to>664</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_hit-frame>0</Hsp_hit-frame>
<Hsp_identity>638</Hsp_identity>
<Hsp_positive>652</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>664</Hsp_align-len>
<Hsp_qseq>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_qseq>
<Hsp_hseq>MTEKSNGVKSSPANNHNHHAPPAIKANGKDDHRSSSRPQSAVDDDTSSELQRLAEMDAPQRGRGGFRRIVRLVGIIRDWANKNFREEEPRPDSFLERFRGPELQTVTTQQGDGKGDKDGEGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQKGYYLVWLVLDYFSDVVYIIDLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHSPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKSVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLEQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLETKMKQNNEDDYLSDGMNSPEPAAADEP</Hsp_hseq>
<Hsp_midline>MTEKSNGVKSSPANNHN+H P IKANGKD+ R+ SRPQSA DDDTSSELQRLAEMDAPQ+ RGGFRRIVRLVG+IR WAN+NFREEE RPDSFLERFRGPELQTVTTQQGDGKGDKDG+GKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQ+GY+LVWLVLDYFSDVVYI DLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIH+PELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKK+VDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKL+QLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLE KMKQN EDDYLSDGMNSPEPAAA++P</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>8</Hit_num>
<Hit_id>ref|XP_045646452.1|</Hit_id>
<Hit_def>cyclic nucleotide-gated olfactory channel [Ursus americanus]</Hit_def>
<Hit_accession>XP_045646452</Hit_accession>
<Hit_len>664</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1246.68</Hsp_bit-score>
<Hsp_score>3223</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>664</Hsp_query-to>
<Hsp_hit-from>1</Hsp_hit-from>
<Hsp_hit-to>664</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_hit-frame>0</Hsp_hit-frame>
<Hsp_identity>637</Hsp_identity>
<Hsp_positive>651</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>664</Hsp_align-len>
<Hsp_qseq>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_qseq>
<Hsp_hseq>MTEKSNGVKCSPANNHNHHAPPAIKANGKDDHRSSSRPQSAVDDDTSSELQRLAEMDAPQRGRGGFRRIVRLVGIIRDWANKNFREEEPRPDSFLERFRGPELQTVTTQQGDGKGDKDGEGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQKGYYLVWLVLDYFSDVVYITDLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHSPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKSVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLEQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLETKMKQNNEDDYLSDGMNSPEPAAADEP</Hsp_hseq>
<Hsp_midline>MTEKSNGVK SPANNHN+H P IKANGKD+ R+ SRPQSA DDDTSSELQRLAEMDAPQ+ RGGFRRIVRLVG+IR WAN+NFREEE RPDSFLERFRGPELQTVTTQQGDGKGDKDG+GKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQ+GY+LVWLVLDYFSDVVYI DLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIH+PELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKK+VDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKL+QLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLE KMKQN EDDYLSDGMNSPEPAAA++P</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>9</Hit_num>
<Hit_id>ref|XP_035942617.1|</Hit_id>
<Hit_def>cyclic nucleotide-gated olfactory channel [Halichoerus grypus]</Hit_def>
<Hit_accession>XP_035942617</Hit_accession>
<Hit_len>664</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1245.9</Hsp_bit-score>
<Hsp_score>3221</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>664</Hsp_query-to>
<Hsp_hit-from>1</Hsp_hit-from>
<Hsp_hit-to>664</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_hit-frame>0</Hsp_hit-frame>
<Hsp_identity>638</Hsp_identity>
<Hsp_positive>651</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>664</Hsp_align-len>
<Hsp_qseq>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_qseq>
<Hsp_hseq>MTEKSNGVKSSPANNHNHHAPPVIKANGKDDHRTSSRPQSAADDDTSSELQRLAEMDVPQQGRGGFRRIVRLVGIIREWANKNFREEELRPDSFLERFRGPELQTVTTQQGDGKGDKDGEGKGTKKKFELFVLDPAGDWYYRWLFVIAMLVLYNWCLLVARACFSDLQKGYYLVWLVLDYFSDVVYITDLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHSPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKSVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLEQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLETKMKQNNMDDYLSDGMNSPEPAAADEP</Hsp_hseq>
<Hsp_midline>MTEKSNGVKSSPANNHN+H P IKANGKD+ RT SRPQSAADDDTSSELQRLAEMD PQQ RGGFRRIVRLVG+IR+WAN+NFREEE RPDSFLERFRGPELQTVTTQQGDGKGDKDG+GKGTKKKFELFVLDPAGDWYYRWLFVIAM VLYNWCLLVARACFSDLQ+GY+LVWLVLDYFSDVVYI DLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIH+PELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKK+VDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKL+QLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLE KMKQN DDYLSDGMNSPEPAAA++P</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>10</Hit_num>
<Hit_id>ref|XP_049729369.1|</Hit_id>
<Hit_def>cyclic nucleotide-gated olfactory channel [Elephas maximus indicus]</Hit_def>
<Hit_accession>XP_049729369</Hit_accession>
<Hit_len>664</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1245.12</Hsp_bit-score>
<Hsp_score>3219</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>664</Hsp_query-to>
<Hsp_hit-from>1</Hsp_hit-from>
<Hsp_hit-to>664</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_hit-frame>0</Hsp_hit-frame>
<Hsp_identity>635</Hsp_identity>
<Hsp_positive>654</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>664</Hsp_align-len>
<Hsp_qseq>MTEKSNGVKSSPANNHNNHVPATIKANGKDESRTRSRPQSAADDDTSSELQRLAEMDAPQQRRGGFRRIVRLVGVIRQWANRNFREEEARPDSFLERFRGPELQTVTTQQGDGKGDKDGDGKGTKKKFELFVLDPAGDWYYRWLFVIAMPVLYNWCLLVARACFSDLQRGYFLVWLVLDYFSDVVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTLQFKLDVASIIPTDLIYFAVGIHNPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPEYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAASMEVDVQEKLKQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLEVKMKQNTEDDYLSDGMNSPEPAAAEQP</Hsp_qseq>
<Hsp_hseq>MTEKSNGVKSSPANNHNHHVPSTIKANGKDDRRTSSRPQSAADDDTSSELQRLAEMDAPQQWRGGFRRIIRLVGVIREWANKNFREEDPRPDSFLERFRGPELQTVTTQQGDGKSDKDGEGKGTKKKFELFVLDPAGDWYYRWLFFIALPVLYNWCLLVARACFSDLQKGYYLVWLVLDYFSDMVYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHTMQFKLDVASIIPTDLIYFAVGIHSPELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDPAYGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAATMEVDVQEKLEQLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLETKMKQNNEEDYLSDGINSPEPAAVEKP</Hsp_hseq>
<Hsp_midline>MTEKSNGVKSSPANNHN+HVP+TIKANGKD+ RT SRPQSAADDDTSSELQRLAEMDAPQQ RGGFRRI+RLVGVIR+WAN+NFREE+ RPDSFLERFRGPELQTVTTQQGDGK DKDG+GKGTKKKFELFVLDPAGDWYYRWLF IA+PVLYNWCLLVARACFSDLQ+GY+LVWLVLDYFSD+VYIADLFIRLRTGFLEQGLLVKDPKKLRDNYIHT+QFKLDVASIIPTDLIYFAVGIH+PELRFNRLLHFARMFEFFDRTETRTSYPNIFRISNLVLYILVIIHWNACIYYAISKSIGFGVDTWVYPNITDP YGYLAREYIYCLYWSTLTLTTIGETPPPVKDEEYLFVIFDFLIGVLIFATIVGNVGSMISNMNATRAEFQAKIDAVKHYMQFRKVSKEMEAKVIKWFDYLWTNKKTVDEREVLKNLPAKLRAEIAINVHLSTLKKVRIFQDCEAGLLVELVLKLRPQVFSPGDYICRKGDIGKEMYIIKEGKLAVVADDGVTQYALLSAGSCFGEISILNIKGSKMGNRRTANIRSLGYSDLFCLSKDDLMEAVTEYPDAKKVLEERGREILMKEGLLDENEVAA+MEVDVQEKL+QLETNMETLYTRFGRLLAEYTGAQQKLKQRITVLE KMKQN E+DYLSDG+NSPEPAA E+P</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>633473216</Statistics_db-num>
<Statistics_db-len>248084082182</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
<Statistics_kappa>0.047</Statistics_kappa>
<Statistics_lambda>0.27</Statistics_lambda>
<Statistics_entropy>1</Statistics_entropy>
</Statistics>
</Iteration_stat>
</Iteration>
</BlastOutput_iterations>
</BlastOutput>

View File

@ -6,7 +6,7 @@
import unittest
from Bio.Blast.Record import HSP
from Bio.Blast.NCBIXML import HSP
class TestHsp(unittest.TestCase):

11484
Tests/test_Blast_parser.py Normal file

File diff suppressed because it is too large Load Diff