mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 21:53:47 +08:00
add psiblast test (#4889)
* update * update --------- Co-authored-by: Michiel de Hoon <mdehoon@madpc2s-MacBook-Pro.local>
This commit is contained in:
@ -30,6 +30,8 @@ from Bio.SeqFeature import SeqFeature
|
||||
from Bio.SeqFeature import SimpleLocation
|
||||
from Bio.SeqRecord import SeqRecord
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class DTDHandler:
|
||||
"""Parser for the BLAST XML DTD file."""
|
||||
@ -132,7 +134,6 @@ class SchemaHandler:
|
||||
"message",
|
||||
"subjects",
|
||||
"bl2seq",
|
||||
"iter-num",
|
||||
):
|
||||
pass # TBD
|
||||
else:
|
||||
@ -351,6 +352,9 @@ class XMLHandler:
|
||||
self._characters = ""
|
||||
|
||||
def _start_iteration(self, name, attributes):
|
||||
if self._program == "psiblast" and name == "http://www.ncbi.nlm.nih.gov Search":
|
||||
# PSIBLAST XML2 uses both <Iteration> and <Search>; ignore one
|
||||
return
|
||||
record = Record()
|
||||
self._record = record
|
||||
|
||||
@ -714,6 +718,9 @@ class XMLHandler:
|
||||
def _end_iteration(self, name):
|
||||
assert self._characters.strip() == ""
|
||||
self._characters = ""
|
||||
if self._program == "psiblast" and name == "http://www.ncbi.nlm.nih.gov Search":
|
||||
# PSIBLAST XML2 uses both <Iteration> and <Search>; ignore one
|
||||
return
|
||||
self._records._cache.append(self._record)
|
||||
del self._record
|
||||
|
||||
@ -926,106 +933,117 @@ class XMLHandler:
|
||||
hsp = self._hsp
|
||||
del self._hsp
|
||||
program = self._program
|
||||
align_len = hsp.align_len
|
||||
query = self._record.query
|
||||
if query is None:
|
||||
query = self._records.query
|
||||
query_id = query.id
|
||||
query_description = query.description
|
||||
query_length = len(query.seq)
|
||||
query_seq_aligned = hsp.qseq.encode()
|
||||
assert len(query_seq_aligned) == align_len
|
||||
target_seq_aligned = hsp.hseq.encode()
|
||||
assert len(target_seq_aligned) == align_len
|
||||
(target_seq_data, query_seq_data), coordinates = (
|
||||
Alignment.parse_printed_alignment([target_seq_aligned, query_seq_aligned])
|
||||
)
|
||||
query = SeqRecord(None, query_id, description=query_description)
|
||||
query_start = hsp.query_from - 1
|
||||
query_end = hsp.query_to
|
||||
if program in ("blastx", "tblastx"):
|
||||
assert query_end - query_start == 3 * len(query_seq_data)
|
||||
location = SimpleLocation(0, len(query_seq_data))
|
||||
coded_by = f"{query_id}:{hsp.query_from}..{hsp.query_to}"
|
||||
query_frame = hsp.query_frame
|
||||
if query_frame > 0:
|
||||
assert query_start % 3 == query_frame - 1
|
||||
elif query_frame < 0:
|
||||
assert (query_length - query_end) % 3 == -query_frame - 1
|
||||
coded_by = f"complement({coded_by})"
|
||||
qualifiers = {"coded_by": coded_by}
|
||||
feature = SeqFeature(location, type="CDS", qualifiers=qualifiers)
|
||||
query.features.append(feature)
|
||||
else:
|
||||
coordinates[1, :] += query_start
|
||||
assert query_end - query_start == len(query_seq_data)
|
||||
query_seq_data = {query_start: query_seq_data}
|
||||
if program == "blastn":
|
||||
try:
|
||||
query_strand = hsp.query_strand
|
||||
except AttributeError:
|
||||
# v1 XML
|
||||
pass
|
||||
else:
|
||||
# v2 XML
|
||||
assert query_strand == "Plus"
|
||||
query.seq = Seq(query_seq_data, query_length)
|
||||
target = self._alignments.target
|
||||
target_id = target.id
|
||||
target_name = target.name
|
||||
target_description = target.description
|
||||
target_length = len(target.seq)
|
||||
target = SeqRecord(None, target_id, target_name, description=target_description)
|
||||
if program in ("blastn", "megablast"):
|
||||
try:
|
||||
target_strand = hsp.hit_strand
|
||||
except AttributeError:
|
||||
# v1 XML
|
||||
target_frame = hsp.hit_frame
|
||||
if target_frame == +1:
|
||||
target_strand = "Plus"
|
||||
elif target_frame == -1:
|
||||
target_strand = "Minus"
|
||||
if target_strand == "Plus":
|
||||
query_seq_aligned = hsp.qseq.encode()
|
||||
target_seq_aligned = hsp.hseq.encode()
|
||||
try:
|
||||
align_len = hsp.align_len
|
||||
except AttributeError: # PSIBLAST XML2
|
||||
assert len(query_seq_aligned) == 0
|
||||
assert len(target_seq_aligned) == 0
|
||||
query_seq_data = None
|
||||
target.seq = Seq(None, target_length)
|
||||
coordinates = np.empty((2, 0), dtype=int)
|
||||
else:
|
||||
assert len(query_seq_aligned) == align_len
|
||||
assert len(target_seq_aligned) == align_len
|
||||
(
|
||||
target_seq_data,
|
||||
query_seq_data,
|
||||
), coordinates = Alignment.parse_printed_alignment(
|
||||
[target_seq_aligned, query_seq_aligned]
|
||||
)
|
||||
query_start = hsp.query_from - 1
|
||||
query_end = hsp.query_to
|
||||
if program in ("blastx", "tblastx"):
|
||||
assert query_end - query_start == 3 * len(query_seq_data)
|
||||
location = SimpleLocation(0, len(query_seq_data))
|
||||
coded_by = f"{query_id}:{hsp.query_from}..{hsp.query_to}"
|
||||
query_frame = hsp.query_frame
|
||||
if query_frame > 0:
|
||||
assert query_start % 3 == query_frame - 1
|
||||
elif query_frame < 0:
|
||||
assert (query_length - query_end) % 3 == -query_frame - 1
|
||||
coded_by = f"complement({coded_by})"
|
||||
qualifiers = {"coded_by": coded_by}
|
||||
feature = SeqFeature(location, type="CDS", qualifiers=qualifiers)
|
||||
query.features.append(feature)
|
||||
else:
|
||||
coordinates[1, :] += query_start
|
||||
assert query_end - query_start == len(query_seq_data)
|
||||
query_seq_data = {query_start: query_seq_data}
|
||||
if program == "blastn":
|
||||
try:
|
||||
query_strand = hsp.query_strand
|
||||
except AttributeError:
|
||||
# v1 XML
|
||||
pass
|
||||
else:
|
||||
# v2 XML
|
||||
assert query_strand == "Plus"
|
||||
if program in ("blastn", "megablast"):
|
||||
try:
|
||||
target_strand = hsp.hit_strand
|
||||
except AttributeError:
|
||||
# v1 XML
|
||||
target_frame = hsp.hit_frame
|
||||
if target_frame == +1:
|
||||
target_strand = "Plus"
|
||||
elif target_frame == -1:
|
||||
target_strand = "Minus"
|
||||
if target_strand == "Plus":
|
||||
target_start = hsp.hit_from - 1
|
||||
target_end = hsp.hit_to
|
||||
coordinates[0, :] += target_start
|
||||
assert target_end - target_start == len(target_seq_data)
|
||||
target_seq_data = {target_start: target_seq_data}
|
||||
target.seq = Seq(target_seq_data, target_length)
|
||||
elif target_strand == "Minus":
|
||||
target_start = hsp.hit_to - 1
|
||||
target_end = hsp.hit_from
|
||||
coordinates[0, :] = target_end - coordinates[0, :]
|
||||
assert target_end - target_start == len(target_seq_data)
|
||||
target_seq_data = {target_length - target_end: target_seq_data}
|
||||
seq = Seq(target_seq_data, target_length)
|
||||
target.seq = seq.reverse_complement()
|
||||
elif program in ("blastp", "blastx", "rpsblast", "psiblast"):
|
||||
target_start = hsp.hit_from - 1
|
||||
target_end = hsp.hit_to
|
||||
coordinates[0, :] += target_start
|
||||
assert target_end - target_start == len(target_seq_data)
|
||||
target_seq_data = {target_start: target_seq_data}
|
||||
target.seq = Seq(target_seq_data, target_length)
|
||||
elif target_strand == "Minus":
|
||||
target_start = hsp.hit_to - 1
|
||||
target_end = hsp.hit_from
|
||||
coordinates[0, :] = target_end - coordinates[0, :]
|
||||
assert target_end - target_start == len(target_seq_data)
|
||||
target_seq_data = {target_length - target_end: target_seq_data}
|
||||
seq = Seq(target_seq_data, target_length)
|
||||
target.seq = seq.reverse_complement()
|
||||
elif program in ("blastp", "blastx", "rpsblast", "psiblast"):
|
||||
target_start = hsp.hit_from - 1
|
||||
target_end = hsp.hit_to
|
||||
coordinates[0, :] += target_start
|
||||
assert target_end - target_start == len(target_seq_data)
|
||||
target_seq_data = {target_start: target_seq_data}
|
||||
target.seq = Seq(target_seq_data, target_length)
|
||||
elif program in ("tblastn", "tblastx"):
|
||||
target_start = hsp.hit_from - 1
|
||||
target_end = hsp.hit_to
|
||||
assert target_end - target_start == 3 * len(target_seq_data)
|
||||
location = SimpleLocation(0, target_length)
|
||||
coded_by = f"{target_id}:{hsp.hit_from}..{hsp.hit_to}"
|
||||
target_frame = hsp.hit_frame
|
||||
if target_frame >= 0:
|
||||
assert target_start % 3 == target_frame - 1
|
||||
elif target_frame < 0:
|
||||
assert (target_length - target_end) % 3 == -target_frame - 1
|
||||
coded_by = f"complement({coded_by})"
|
||||
qualifiers = {"coded_by": coded_by}
|
||||
feature = SeqFeature(location, type="CDS", qualifiers=qualifiers)
|
||||
target.features.append(feature)
|
||||
target.seq = Seq(target_seq_data, target_length)
|
||||
else:
|
||||
raise RuntimeError("Unexpected program name '%s'" % program)
|
||||
elif program in ("tblastn", "tblastx"):
|
||||
target_start = hsp.hit_from - 1
|
||||
target_end = hsp.hit_to
|
||||
assert target_end - target_start == 3 * len(target_seq_data)
|
||||
location = SimpleLocation(0, target_length)
|
||||
coded_by = f"{target_id}:{hsp.hit_from}..{hsp.hit_to}"
|
||||
target_frame = hsp.hit_frame
|
||||
if target_frame >= 0:
|
||||
assert target_start % 3 == target_frame - 1
|
||||
elif target_frame < 0:
|
||||
assert (target_length - target_end) % 3 == -target_frame - 1
|
||||
coded_by = f"complement({coded_by})"
|
||||
qualifiers = {"coded_by": coded_by}
|
||||
feature = SeqFeature(location, type="CDS", qualifiers=qualifiers)
|
||||
target.features.append(feature)
|
||||
target.seq = Seq(target_seq_data, target_length)
|
||||
else:
|
||||
raise RuntimeError("Unexpected program name '%s'" % program)
|
||||
query.seq = Seq(query_seq_data, query_length)
|
||||
sequences = [target, query]
|
||||
alignment = HSP(sequences, coordinates)
|
||||
alignment.num = hsp.num
|
||||
@ -1044,7 +1062,11 @@ class XMLHandler:
|
||||
except AttributeError:
|
||||
# missing in legacy megablast
|
||||
pass
|
||||
annotations["midline"] = hsp.midline
|
||||
try:
|
||||
annotations["midline"] = hsp.midline
|
||||
except AttributeError:
|
||||
# missing in psiblast for XML2
|
||||
pass
|
||||
alignment.annotations = annotations
|
||||
self._alignments.append(alignment)
|
||||
|
||||
|
@ -70,7 +70,6 @@ class BaseXMLWriter(ABC):
|
||||
|
||||
def _write_params(self, param):
|
||||
self._start_param()
|
||||
self.stream.write(b" <Parameters>\n")
|
||||
value = param.get("matrix")
|
||||
if value is not None:
|
||||
self._write_parameters_matrix(value.encode())
|
||||
@ -111,7 +110,6 @@ class BaseXMLWriter(ABC):
|
||||
value = param.get("bl2seq-mode")
|
||||
if value is not None:
|
||||
self._write_parameters_bl2seq_mode(value.encode())
|
||||
self.stream.write(b" </Parameters>\n")
|
||||
self._end_param()
|
||||
|
||||
def _write_records(self, records):
|
||||
@ -125,20 +123,16 @@ class BaseXMLWriter(ABC):
|
||||
|
||||
def _write_record(self, record):
|
||||
stream = self.stream
|
||||
self._start_iteration()
|
||||
try:
|
||||
num = record.num
|
||||
except AttributeError: # XML2
|
||||
pass
|
||||
else:
|
||||
self._write_iteration_num(num)
|
||||
self._start_iteration(record)
|
||||
query = record.query
|
||||
if query is None:
|
||||
query_length = None
|
||||
else:
|
||||
query_length = len(query.seq)
|
||||
self._write_iteration_query_id(query.id.encode())
|
||||
self._write_iteration_query_def(query.description.encode())
|
||||
description = query.description
|
||||
if description != "<unknown description>":
|
||||
self._write_iteration_query_def(description.encode())
|
||||
self._write_iteration_query_len(query_length)
|
||||
for feature in query.features:
|
||||
if feature.type == "masking":
|
||||
@ -198,76 +192,88 @@ class BaseXMLWriter(ABC):
|
||||
query = hsp.query
|
||||
target = hsp.target
|
||||
coordinates = hsp.coordinates
|
||||
hit_from, query_from = coordinates[:, 0]
|
||||
hit_to, query_to = coordinates[:, -1]
|
||||
if program in ("blastn", "megablast"):
|
||||
if hit_from <= hit_to:
|
||||
hit_frame = 1
|
||||
if coordinates.shape[1] > 0:
|
||||
hit_from, query_from = coordinates[:, 0]
|
||||
hit_to, query_to = coordinates[:, -1]
|
||||
if program in ("blastn", "megablast"):
|
||||
if hit_from <= hit_to:
|
||||
hit_frame = 1
|
||||
hit_from += 1
|
||||
else:
|
||||
hit_frame = -1
|
||||
hit_to += 1
|
||||
elif program in ("blastp", "blastx", "rpsblast", "psiblast"):
|
||||
hit_from += 1
|
||||
else:
|
||||
hit_frame = -1
|
||||
hit_to += 1
|
||||
elif program in ("blastp", "blastx", "rpsblast"):
|
||||
hit_from += 1
|
||||
hit_frame = 0
|
||||
elif program in ("tblastn", "tblastx"):
|
||||
feature = target.features[0]
|
||||
coded_by = feature.qualifiers["coded_by"]
|
||||
if coded_by.startswith("complement("):
|
||||
assert coded_by.endswith(")")
|
||||
coded_by = coded_by[11:-1]
|
||||
strand = -1
|
||||
else:
|
||||
strand = +1
|
||||
hit_id, hit_from_to = coded_by.split(":")
|
||||
hit_from, hit_to = hit_from_to.split("..")
|
||||
hit_from = int(hit_from)
|
||||
hit_to = int(hit_to)
|
||||
hit_start = hit_from - 1
|
||||
hit_end = hit_to
|
||||
if strand == +1:
|
||||
hit_frame = hit_start % 3 + 1
|
||||
else:
|
||||
hit_frame = (hit_end - target_length) % -3 - 1
|
||||
if program in ("blastn", "megablast"):
|
||||
if query_from <= query_to:
|
||||
hit_frame = 0
|
||||
elif program in ("tblastn", "tblastx"):
|
||||
feature = target.features[0]
|
||||
coded_by = feature.qualifiers["coded_by"]
|
||||
if coded_by.startswith("complement("):
|
||||
assert coded_by.endswith(")")
|
||||
coded_by = coded_by[11:-1]
|
||||
strand = -1
|
||||
else:
|
||||
strand = +1
|
||||
hit_id, hit_from_to = coded_by.split(":")
|
||||
hit_from, hit_to = hit_from_to.split("..")
|
||||
hit_from = int(hit_from)
|
||||
hit_to = int(hit_to)
|
||||
hit_start = hit_from - 1
|
||||
hit_end = hit_to
|
||||
if strand == +1:
|
||||
hit_frame = hit_start % 3 + 1
|
||||
else:
|
||||
hit_frame = (hit_end - target_length) % -3 - 1
|
||||
if program in ("blastn", "megablast"):
|
||||
if query_from <= query_to:
|
||||
query_from += 1
|
||||
query_frame = 1
|
||||
else:
|
||||
query_to += 1
|
||||
query_frame = -1
|
||||
elif program in ("blastp", "tblastn", "rpsblast", "psiblast"):
|
||||
query_from += 1
|
||||
query_frame = 1
|
||||
else:
|
||||
query_to += 1
|
||||
query_frame = -1
|
||||
elif program in ("blastp", "tblastn", "rpsblast"):
|
||||
query_from += 1
|
||||
query_frame = 0
|
||||
elif program in ("blastx", "tblastx"):
|
||||
feature = query.features[0]
|
||||
coded_by = feature.qualifiers["coded_by"]
|
||||
if coded_by.startswith("complement("):
|
||||
assert coded_by.endswith(")")
|
||||
coded_by = coded_by[11:-1]
|
||||
strand = -1
|
||||
else:
|
||||
strand = +1
|
||||
query_id, query_from_to = coded_by.split(":")
|
||||
query_from, query_to = query_from_to.split("..")
|
||||
query_from = int(query_from)
|
||||
query_to = int(query_to)
|
||||
query_start = query_from - 1
|
||||
query_end = query_to
|
||||
if strand == +1:
|
||||
query_frame = query_start % 3 + 1
|
||||
else:
|
||||
query_frame = (query_end - query_length) % -3 - 1
|
||||
hseq = hsp[0]
|
||||
qseq = hsp[1]
|
||||
align_len = len(hseq)
|
||||
query_frame = 0
|
||||
elif program in ("blastx", "tblastx"):
|
||||
feature = query.features[0]
|
||||
coded_by = feature.qualifiers["coded_by"]
|
||||
if coded_by.startswith("complement("):
|
||||
assert coded_by.endswith(")")
|
||||
coded_by = coded_by[11:-1]
|
||||
strand = -1
|
||||
else:
|
||||
strand = +1
|
||||
query_id, query_from_to = coded_by.split(":")
|
||||
query_from, query_to = query_from_to.split("..")
|
||||
query_from = int(query_from)
|
||||
query_to = int(query_to)
|
||||
query_start = query_from - 1
|
||||
query_end = query_to
|
||||
if strand == +1:
|
||||
query_frame = query_start % 3 + 1
|
||||
else:
|
||||
query_frame = (query_end - query_length) % -3 - 1
|
||||
hseq = hsp[0]
|
||||
qseq = hsp[1]
|
||||
align_len = len(hseq)
|
||||
else:
|
||||
# PSIBLAST XML2
|
||||
query_from = 0
|
||||
query_to = 0
|
||||
hit_from = 0
|
||||
hit_to = 0
|
||||
hseq = ""
|
||||
qseq = ""
|
||||
query_frame = None
|
||||
hit_frame = None
|
||||
align_len = None
|
||||
annotations = hsp.annotations
|
||||
bit_score = annotations["bit score"]
|
||||
evalue = annotations["evalue"]
|
||||
identity = annotations["identity"]
|
||||
positive = annotations.get("positive")
|
||||
gaps = annotations.get("gaps")
|
||||
midline = annotations["midline"]
|
||||
midline = annotations.get("midline")
|
||||
self._start_hsp()
|
||||
self._write_hsp_num(hsp.num)
|
||||
self._write_hsp_bit_score(str(bit_score).encode())
|
||||
@ -277,17 +283,21 @@ class BaseXMLWriter(ABC):
|
||||
self._write_hsp_query_to(query_to)
|
||||
self._write_hsp_hit_from(hit_from)
|
||||
self._write_hsp_hit_to(hit_to)
|
||||
self._write_hsp_query_frame(query_frame)
|
||||
self._write_hsp_hit_frame(hit_frame)
|
||||
if query_frame is not None:
|
||||
self._write_hsp_query_frame(query_frame)
|
||||
if hit_frame is not None:
|
||||
self._write_hsp_hit_frame(hit_frame)
|
||||
self._write_hsp_identity(identity)
|
||||
if positive is not None:
|
||||
self._write_hsp_positive(positive)
|
||||
if gaps is not None:
|
||||
self._write_hsp_gaps(gaps)
|
||||
self._write_hsp_align_len(align_len)
|
||||
if align_len is not None:
|
||||
self._write_hsp_align_len(align_len)
|
||||
self._write_hsp_qseq(qseq.encode())
|
||||
self._write_hsp_hseq(hseq.encode())
|
||||
self._write_hsp_midline(midline.encode())
|
||||
if midline is not None:
|
||||
self._write_hsp_midline(midline.encode())
|
||||
self._end_hsp()
|
||||
|
||||
def _write_statistics(self, stat):
|
||||
@ -645,10 +655,20 @@ class XMLWriter(BaseXMLWriter):
|
||||
)
|
||||
|
||||
def _start_param(self):
|
||||
self.stream.write(b" <BlastOutput_param>\n")
|
||||
self.stream.write(
|
||||
b"""\
|
||||
<BlastOutput_param>
|
||||
<Parameters>
|
||||
"""
|
||||
)
|
||||
|
||||
def _end_param(self):
|
||||
self.stream.write(b" </BlastOutput_param>\n")
|
||||
self.stream.write(
|
||||
b"""\
|
||||
</Parameters>
|
||||
</BlastOutput_param>
|
||||
"""
|
||||
)
|
||||
|
||||
def _start_iterations(self):
|
||||
self.stream.write(b"<BlastOutput_iterations>\n")
|
||||
@ -662,8 +682,9 @@ class XMLWriter(BaseXMLWriter):
|
||||
def _end_mbstat(self):
|
||||
self.stream.write(b" </BlastOutput_mbstat>\n")
|
||||
|
||||
def _start_iteration(self):
|
||||
def _start_iteration(self, record):
|
||||
self.stream.write(b"<Iteration>\n")
|
||||
self._write_iteration_num(record.num)
|
||||
|
||||
def _end_iteration(self):
|
||||
self.stream.write(b"</Iteration>\n")
|
||||
@ -709,35 +730,35 @@ class XMLWriter(BaseXMLWriter):
|
||||
|
||||
def _write_parameters_sc_match(self, value):
|
||||
self.stream.write(
|
||||
b" <Parameters_sc-match>%d</Parameters_sc-match>\n" % value
|
||||
b" <Parameters_sc-match>%d</Parameters_sc-match>\n" % value
|
||||
)
|
||||
|
||||
def _write_parameters_sc_mismatch(self, value):
|
||||
self.stream.write(
|
||||
b" <Parameters_sc-mismatch>%d</Parameters_sc-mismatch>\n" % value
|
||||
b" <Parameters_sc-mismatch>%d</Parameters_sc-mismatch>\n" % value
|
||||
)
|
||||
|
||||
def _write_parameters_gap_open(self, value):
|
||||
self.stream.write(
|
||||
b" <Parameters_gap-open>%d</Parameters_gap-open>\n" % value
|
||||
b" <Parameters_gap-open>%d</Parameters_gap-open>\n" % value
|
||||
)
|
||||
|
||||
def _write_parameters_gap_extend(self, value):
|
||||
self.stream.write(
|
||||
b" <Parameters_gap-extend>%d</Parameters_gap-extend>\n" % value
|
||||
b" <Parameters_gap-extend>%d</Parameters_gap-extend>\n" % value
|
||||
)
|
||||
|
||||
def _write_parameters_filter(self, value):
|
||||
self.stream.write(b" <Parameters_filter>%b</Parameters_filter>\n" % value)
|
||||
self.stream.write(b" <Parameters_filter>%b</Parameters_filter>\n" % value)
|
||||
|
||||
def _write_parameters_pattern(self, value):
|
||||
self.stream.write(
|
||||
b" <Parameters_pattern>%b</Parameters_pattern>\n" % value
|
||||
b" <Parameters_pattern>%b</Parameters_pattern>\n" % value
|
||||
)
|
||||
|
||||
def _write_parameters_entrez_query(self, value):
|
||||
self.stream.write(
|
||||
b" <Parameters_entrez-query>%b</Parameters_entrez-query>\n" % value
|
||||
b" <Parameters_entrez-query>%b</Parameters_entrez-query>\n" % value
|
||||
)
|
||||
|
||||
def _write_statistics_db_num(self, db_num):
|
||||
@ -943,7 +964,18 @@ class XML2Writer(BaseXMLWriter):
|
||||
"""
|
||||
)
|
||||
|
||||
def _start_iteration(self):
|
||||
def _start_iteration(self, record):
|
||||
try:
|
||||
num = record.num
|
||||
except AttributeError:
|
||||
pass
|
||||
else: # PSIBLAST
|
||||
self.stream.write(
|
||||
b"""\
|
||||
<Iteration>
|
||||
"""
|
||||
)
|
||||
self._write_iteration_num(num)
|
||||
self.stream.write(
|
||||
b"""\
|
||||
<search>
|
||||
@ -958,6 +990,15 @@ class XML2Writer(BaseXMLWriter):
|
||||
</search>
|
||||
"""
|
||||
)
|
||||
if self._program == "psiblast":
|
||||
self.stream.write(
|
||||
b"""\
|
||||
</Iteration>
|
||||
"""
|
||||
)
|
||||
|
||||
def _write_iteration_num(self, num):
|
||||
self.stream.write(b" <iter-num>%d</iter-num>\n" % num)
|
||||
|
||||
def _write_iteration_query_id(self, query_id):
|
||||
self.stream.write(
|
||||
|
137
Tests/Blast/xml2_21500_psiblast_001.xml
Normal file
137
Tests/Blast/xml2_21500_psiblast_001.xml
Normal file
@ -0,0 +1,137 @@
|
||||
<?xml version="1.0"?>
|
||||
<BlastXML2
|
||||
xmlns="http://www.ncbi.nlm.nih.gov"
|
||||
xmlns:xs="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xs:schemaLocation="http://www.ncbi.nlm.nih.gov http://www.ncbi.nlm.nih.gov/data_specs/schema_alt/NCBI_BlastOutput2.xsd"
|
||||
>
|
||||
<BlastOutput2>
|
||||
<report>
|
||||
<Report>
|
||||
<program>psiblast</program>
|
||||
<version>PSIBLAST 2.15.0+</version>
|
||||
<reference>Alejandro A. Sch&auml;ffer, L. Aravind, Thomas L. Madden, Sergei Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements", Nucleic Acids Res. 29:2994-3005.</reference>
|
||||
<search-target>
|
||||
<Target>
|
||||
<db>swissprot</db>
|
||||
</Target>
|
||||
</search-target>
|
||||
<params>
|
||||
<Parameters>
|
||||
<matrix>BLOSUM62</matrix>
|
||||
<expect>1e-30</expect>
|
||||
<gap-open>11</gap-open>
|
||||
<gap-extend>1</gap-extend>
|
||||
<filter>F</filter>
|
||||
<cbs>2</cbs>
|
||||
</Parameters>
|
||||
</params>
|
||||
<results>
|
||||
<Results>
|
||||
<iterations>
|
||||
<Iteration>
|
||||
<iter-num>1</iter-num>
|
||||
<search>
|
||||
<Search>
|
||||
<query-id>lcl|Query_1</query-id>
|
||||
<query-len>103</query-len>
|
||||
<hits>
|
||||
<Hit>
|
||||
<num>1</num>
|
||||
<description>
|
||||
<HitDescr>
|
||||
<id>sp|P69428.1|</id>
|
||||
<accession>P69428</accession>
|
||||
<title>RecName: Full=Sec-independent protein translocase protein TatA [Escherichia coli K-12]</title>
|
||||
<taxid>83333</taxid>
|
||||
</HitDescr>
|
||||
<HitDescr>
|
||||
<id>sp|P69429.1|</id>
|
||||
<accession>P69429</accession>
|
||||
<title>RecName: Full=Sec-independent protein translocase protein TatA [Escherichia coli CFT073]</title>
|
||||
<taxid>199310</taxid>
|
||||
</HitDescr>
|
||||
<HitDescr>
|
||||
<id>sp|P69430.1|</id>
|
||||
<accession>P69430</accession>
|
||||
<title>RecName: Full=Sec-independent protein translocase protein TatA [Escherichia coli O157:H7]</title>
|
||||
<taxid>83334</taxid>
|
||||
</HitDescr>
|
||||
<HitDescr>
|
||||
<id>sp|P69431.1|</id>
|
||||
<accession>P69431</accession>
|
||||
<title>RecName: Full=Sec-independent protein translocase protein TatA [Shigella flexneri]</title>
|
||||
<taxid>623</taxid>
|
||||
</HitDescr>
|
||||
</description>
|
||||
<len>89</len>
|
||||
<hsps>
|
||||
<Hsp>
|
||||
<num>1</num>
|
||||
<bit-score>177.178</bit-score>
|
||||
<score>448</score>
|
||||
<evalue>2.3039e-58</evalue>
|
||||
<identity>89</identity>
|
||||
<query-from>0</query-from>
|
||||
<query-to>0</query-to>
|
||||
<hit-from>0</hit-from>
|
||||
<hit-to>0</hit-to>
|
||||
<qseq></qseq>
|
||||
<hseq></hseq>
|
||||
</Hsp>
|
||||
</hsps>
|
||||
</Hit>
|
||||
<Hit>
|
||||
<num>2</num>
|
||||
<description>
|
||||
<HitDescr>
|
||||
<id>sp|P0A2H3.1|</id>
|
||||
<accession>P0A2H3</accession>
|
||||
<title>RecName: Full=Sec-independent protein translocase protein TatA [Salmonella enterica subsp. enterica serovar Typhimurium str. LT2]</title>
|
||||
<taxid>99287</taxid>
|
||||
</HitDescr>
|
||||
<HitDescr>
|
||||
<id>sp|P0A2H4.1|</id>
|
||||
<accession>P0A2H4</accession>
|
||||
<title>RecName: Full=Sec-independent protein translocase protein TatA [Salmonella enterica subsp. enterica serovar Typhi]</title>
|
||||
<taxid>90370</taxid>
|
||||
</HitDescr>
|
||||
</description>
|
||||
<len>84</len>
|
||||
<hsps>
|
||||
<Hsp>
|
||||
<num>1</num>
|
||||
<bit-score>142.51</bit-score>
|
||||
<score>358</score>
|
||||
<evalue>1.0691e-44</evalue>
|
||||
<identity>75</identity>
|
||||
<query-from>0</query-from>
|
||||
<query-to>0</query-to>
|
||||
<hit-from>0</hit-from>
|
||||
<hit-to>0</hit-to>
|
||||
<qseq></qseq>
|
||||
<hseq></hseq>
|
||||
</Hsp>
|
||||
</hsps>
|
||||
</Hit>
|
||||
</hits>
|
||||
<stat>
|
||||
<Statistics>
|
||||
<db-num>482816</db-num>
|
||||
<db-len>183558113</db-len>
|
||||
<hsp-len>72</hsp-len>
|
||||
<eff-space>4627826878</eff-space>
|
||||
<kappa>0.041</kappa>
|
||||
<lambda>0.267</lambda>
|
||||
<entropy>0.14</entropy>
|
||||
</Statistics>
|
||||
</stat>
|
||||
</Search>
|
||||
</search>
|
||||
</Iteration>
|
||||
</iterations>
|
||||
</Results>
|
||||
</results>
|
||||
</Report>
|
||||
</report>
|
||||
</BlastOutput2>
|
||||
</BlastXML2>
|
98
Tests/Blast/xml_21500_psiblast_001.xml
Normal file
98
Tests/Blast/xml_21500_psiblast_001.xml
Normal file
@ -0,0 +1,98 @@
|
||||
<?xml version="1.0"?>
|
||||
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
|
||||
<BlastOutput>
|
||||
<BlastOutput_program>psiblast</BlastOutput_program>
|
||||
<BlastOutput_version>PSIBLAST 2.15.0+</BlastOutput_version>
|
||||
<BlastOutput_reference>Alejandro A. Sch&auml;ffer, L. Aravind, Thomas L. Madden, Sergei Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements", Nucleic Acids Res. 29:2994-3005.</BlastOutput_reference>
|
||||
<BlastOutput_db>swissprot</BlastOutput_db>
|
||||
<BlastOutput_query-ID>Query_1</BlastOutput_query-ID>
|
||||
<BlastOutput_query-def>WP_001234791.1 Sec-independent protein translocase subunit TatA [Shigella flexneri]</BlastOutput_query-def>
|
||||
<BlastOutput_query-len>103</BlastOutput_query-len>
|
||||
<BlastOutput_param>
|
||||
<Parameters>
|
||||
<Parameters_matrix>BLOSUM62</Parameters_matrix>
|
||||
<Parameters_expect>1e-30</Parameters_expect>
|
||||
<Parameters_gap-open>11</Parameters_gap-open>
|
||||
<Parameters_gap-extend>1</Parameters_gap-extend>
|
||||
<Parameters_filter>F</Parameters_filter>
|
||||
</Parameters>
|
||||
</BlastOutput_param>
|
||||
<BlastOutput_iterations>
|
||||
<Iteration>
|
||||
<Iteration_iter-num>1</Iteration_iter-num>
|
||||
<Iteration_query-ID>Query_1</Iteration_query-ID>
|
||||
<Iteration_query-def>WP_001234791.1 Sec-independent protein translocase subunit TatA [Shigella flexneri]</Iteration_query-def>
|
||||
<Iteration_query-len>103</Iteration_query-len>
|
||||
<Iteration_hits>
|
||||
<Hit>
|
||||
<Hit_num>1</Hit_num>
|
||||
<Hit_id>sp|P69428.1|</Hit_id>
|
||||
<Hit_def>RecName: Full=Sec-independent protein translocase protein TatA [Escherichia coli K-12] >sp|P69429.1| RecName: Full=Sec-independent protein translocase protein TatA [Escherichia coli CFT073] >sp|P69430.1| RecName: Full=Sec-independent protein translocase protein TatA [Escherichia coli O157:H7] >sp|P69431.1| RecName: Full=Sec-independent protein translocase protein TatA [Shigella flexneri]</Hit_def>
|
||||
<Hit_accession>P69428</Hit_accession>
|
||||
<Hit_len>89</Hit_len>
|
||||
<Hit_hsps>
|
||||
<Hsp>
|
||||
<Hsp_num>1</Hsp_num>
|
||||
<Hsp_bit-score>177.178</Hsp_bit-score>
|
||||
<Hsp_score>448</Hsp_score>
|
||||
<Hsp_evalue>2.3039e-58</Hsp_evalue>
|
||||
<Hsp_query-from>15</Hsp_query-from>
|
||||
<Hsp_query-to>103</Hsp_query-to>
|
||||
<Hsp_hit-from>1</Hsp_hit-from>
|
||||
<Hsp_hit-to>89</Hsp_hit-to>
|
||||
<Hsp_query-frame>0</Hsp_query-frame>
|
||||
<Hsp_hit-frame>0</Hsp_hit-frame>
|
||||
<Hsp_identity>89</Hsp_identity>
|
||||
<Hsp_positive>89</Hsp_positive>
|
||||
<Hsp_gaps>0</Hsp_gaps>
|
||||
<Hsp_align-len>89</Hsp_align-len>
|
||||
<Hsp_qseq>MGGISIWQLLIIAVIVVLLFGTKKLGSIGSDLGASIKGFKKAMSDDEPKQDKTSQDADFTAKTIADKQADTNQEQAKTEDAKRHDKEQV</Hsp_qseq>
|
||||
<Hsp_hseq>MGGISIWQLLIIAVIVVLLFGTKKLGSIGSDLGASIKGFKKAMSDDEPKQDKTSQDADFTAKTIADKQADTNQEQAKTEDAKRHDKEQV</Hsp_hseq>
|
||||
<Hsp_midline>MGGISIWQLLIIAVIVVLLFGTKKLGSIGSDLGASIKGFKKAMSDDEPKQDKTSQDADFTAKTIADKQADTNQEQAKTEDAKRHDKEQV</Hsp_midline>
|
||||
</Hsp>
|
||||
</Hit_hsps>
|
||||
</Hit>
|
||||
<Hit>
|
||||
<Hit_num>2</Hit_num>
|
||||
<Hit_id>sp|P0A2H3.1|</Hit_id>
|
||||
<Hit_def>RecName: Full=Sec-independent protein translocase protein TatA [Salmonella enterica subsp. enterica serovar Typhimurium str. LT2] >sp|P0A2H4.1| RecName: Full=Sec-independent protein translocase protein TatA [Salmonella enterica subsp. enterica serovar Typhi]</Hit_def>
|
||||
<Hit_accession>P0A2H3</Hit_accession>
|
||||
<Hit_len>84</Hit_len>
|
||||
<Hit_hsps>
|
||||
<Hsp>
|
||||
<Hsp_num>1</Hsp_num>
|
||||
<Hsp_bit-score>142.51</Hsp_bit-score>
|
||||
<Hsp_score>358</Hsp_score>
|
||||
<Hsp_evalue>1.0691e-44</Hsp_evalue>
|
||||
<Hsp_query-from>15</Hsp_query-from>
|
||||
<Hsp_query-to>103</Hsp_query-to>
|
||||
<Hsp_hit-from>1</Hsp_hit-from>
|
||||
<Hsp_hit-to>84</Hsp_hit-to>
|
||||
<Hsp_query-frame>0</Hsp_query-frame>
|
||||
<Hsp_hit-frame>0</Hsp_hit-frame>
|
||||
<Hsp_identity>75</Hsp_identity>
|
||||
<Hsp_positive>79</Hsp_positive>
|
||||
<Hsp_gaps>5</Hsp_gaps>
|
||||
<Hsp_align-len>89</Hsp_align-len>
|
||||
<Hsp_qseq>MGGISIWQLLIIAVIVVLLFGTKKLGSIGSDLGASIKGFKKAMSDDEPKQDKTSQDADFTAKTIADKQADTNQEQAKTEDAKRHDKEQV</Hsp_qseq>
|
||||
<Hsp_hseq>MGGISIWQLLIVAVIVVLLFGTKKLGSIGSDLGASIKGFKKAMSDDDAKQDKTSQDADFTAKSIADKQG-----EAKKEDAKSQDKEQV</Hsp_hseq>
|
||||
<Hsp_midline>MGGISIWQLLI+AVIVVLLFGTKKLGSIGSDLGASIKGFKKAMSDD+ KQDKTSQDADFTAK+IADKQ +AK EDAK DKEQV</Hsp_midline>
|
||||
</Hsp>
|
||||
</Hit_hsps>
|
||||
</Hit>
|
||||
</Iteration_hits>
|
||||
<Iteration_stat>
|
||||
<Statistics>
|
||||
<Statistics_db-num>482816</Statistics_db-num>
|
||||
<Statistics_db-len>183558113</Statistics_db-len>
|
||||
<Statistics_hsp-len>72</Statistics_hsp-len>
|
||||
<Statistics_eff-space>4627826878</Statistics_eff-space>
|
||||
<Statistics_kappa>0.041</Statistics_kappa>
|
||||
<Statistics_lambda>0.267</Statistics_lambda>
|
||||
<Statistics_entropy>0.14</Statistics_entropy>
|
||||
</Statistics>
|
||||
</Iteration_stat>
|
||||
</Iteration>
|
||||
</BlastOutput_iterations>
|
||||
</BlastOutput>
|
||||
|
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user