mirror of
https://github.com/biopython/biopython.git
synced 2025-10-21 23:38:51 +08:00
123 lines
4.7 KiB
Python
123 lines
4.7 KiB
Python
# Copyright 2008-2015 by Peter Cock. All rights reserved.
|
|
#
|
|
# This file is part of the Biopython distribution and governed by your
|
|
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
|
# Please see the LICENSE file that should have been included as part of this
|
|
# package.
|
|
"""Bio.SeqIO support for the "ace" file format.
|
|
|
|
You are expected to use this module via the Bio.SeqIO functions.
|
|
See also the Bio.Sequencing.Ace module which offers more than just accessing
|
|
the contig consensus sequences in an ACE file as SeqRecord objects.
|
|
"""
|
|
|
|
from Bio.Seq import Seq
|
|
from Bio.SeqRecord import SeqRecord
|
|
from Bio.Sequencing import Ace
|
|
|
|
from .Interfaces import _TextIOSource
|
|
from .Interfaces import SequenceIterator
|
|
|
|
|
|
class AceIterator(SequenceIterator):
|
|
"""Return SeqRecord objects from an ACE file."""
|
|
|
|
modes = "t"
|
|
|
|
def __init__(
|
|
self,
|
|
source: _TextIOSource,
|
|
) -> None:
|
|
"""Iterate over SeqRecord objects read from an ACE file.
|
|
|
|
Arguments:
|
|
- source - input stream opened in text mode, or a path to a file
|
|
|
|
This uses the Bio.Sequencing.Ace module to do the hard work. Note that
|
|
by iterating over the file in a single pass, we are forced to ignore any
|
|
WA, CT, RT or WR footer tags.
|
|
|
|
Ace files include the base quality for each position, which are taken to
|
|
be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
|
|
using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
|
|
letter_annotations dictionary under the "phred_quality" key.
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> with open("Ace/consed_sample.ace") as handle:
|
|
... for record in SeqIO.parse(handle, "ace"):
|
|
... print("%s %s... %i" % (record.id, record.seq[:10], len(record)))
|
|
... print(max(record.letter_annotations["phred_quality"]))
|
|
Contig1 agccccgggc... 1475
|
|
90
|
|
|
|
However, ACE files do not include a base quality for any gaps in the
|
|
consensus sequence, and these are represented in Biopython with quality
|
|
of zero. Using zero is perhaps misleading as there may be very strong
|
|
evidence to support the gap in the consensus. Previous versions of
|
|
Biopython therefore used None instead, but this complicated usage,
|
|
and prevented output of the gapped sequence as FASTQ format.
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> with open("Ace/contig1.ace") as handle:
|
|
... for record in SeqIO.parse(handle, "ace"):
|
|
... print("%s ...%s..." % (record.id, record.seq[85:95]))
|
|
... print(record.letter_annotations["phred_quality"][85:95])
|
|
... print(max(record.letter_annotations["phred_quality"]))
|
|
Contig1 ...AGAGG-ATGC...
|
|
[57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
|
|
90
|
|
Contig2 ...GAATTACTAT...
|
|
[68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
|
|
90
|
|
|
|
"""
|
|
super().__init__(source, fmt="ACE")
|
|
self.ace_contigs = Ace.parse(self.stream)
|
|
|
|
def __next__(self):
|
|
try:
|
|
ace_contig = next(self.ace_contigs)
|
|
except StopIteration:
|
|
raise StopIteration from None
|
|
# Convert the ACE contig record into a SeqRecord...
|
|
consensus_seq_str = ace_contig.sequence
|
|
if "*" in consensus_seq_str:
|
|
# For consistency with most other file formats, map
|
|
# any * gaps into - gaps.
|
|
assert "-" not in consensus_seq_str
|
|
consensus_seq = Seq(consensus_seq_str.replace("*", "-"))
|
|
else:
|
|
consensus_seq = Seq(consensus_seq_str)
|
|
|
|
# TODO? - Base segments (BS lines) which indicates which read
|
|
# phrap has chosen to be the consensus at a particular position.
|
|
# Perhaps as SeqFeature objects?
|
|
|
|
# TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
|
|
# Perhaps as SeqFeature objects?
|
|
|
|
seq_record = SeqRecord(consensus_seq, id=ace_contig.name, name=ace_contig.name)
|
|
|
|
# Consensus base quality (BQ lines). Note that any gaps (originally
|
|
# as * characters) in the consensus do not get a quality entry, so
|
|
# we assign a quality of None (zero would be misleading as there may
|
|
# be excellent support for having a gap here).
|
|
quals = []
|
|
i = 0
|
|
for base in consensus_seq:
|
|
if base == "-":
|
|
quals.append(0)
|
|
else:
|
|
quals.append(ace_contig.quality[i])
|
|
i += 1
|
|
assert i == len(ace_contig.quality)
|
|
seq_record.letter_annotations["phred_quality"] = quals
|
|
|
|
return seq_record
|
|
|
|
|
|
if __name__ == "__main__":
|
|
from Bio._utils import run_doctest
|
|
|
|
run_doctest()
|