# Copyright 2000 by Jeffrey Chang, Brad Chapman. All rights reserved.
# Copyright 2006-2017 by Peter Cock. All rights reserved.
#
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.

"""Code to work with GenBank formatted files.
|
|
|
|
Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with
|
|
the "genbank" or "embl" format names to parse GenBank or EMBL files into
|
|
SeqRecord and SeqFeature objects (see the Biopython tutorial for details).
|
|
|
|
Using Bio.GenBank directly to parse GenBank files is only useful if you want
|
|
to obtain GenBank-specific Record objects, which is a much closer
|
|
representation to the raw file contents than the SeqRecord alternative from
|
|
the FeatureParser (used in Bio.SeqIO).
|
|
|
|
To use the Bio.GenBank parser, there are two helper functions:
|
|
|
|
- read Parse a handle containing a single GenBank record
|
|
as Bio.GenBank specific Record objects.
|
|
- parse Iterate over a handle containing multiple GenBank
|
|
records as Bio.GenBank specific Record objects.
|
|
|
|
The following internal classes are not intended for direct use and may
|
|
be deprecated in a future release.
|
|
|
|
Classes:
|
|
- Iterator Iterate through a file of GenBank entries
|
|
- FeatureParser Parse GenBank data in SeqRecord and SeqFeature objects.
|
|
- RecordParser Parse GenBank data into a Record object.
|
|
|
|
Exceptions:
|
|
- ParserFailureError Exception indicating a failure in the parser (ie.
|
|
scanner or consumer)
|
|
|
|
"""
|
|
|
|
import re
import warnings

from Bio import BiopythonParserWarning
from Bio.Seq import Seq
from Bio.SeqFeature import Location
from Bio.SeqFeature import LocationParserError
from Bio.SeqFeature import Reference
from Bio.SeqFeature import SeqFeature
from Bio.SeqFeature import SimpleLocation

from .Scanner import GenBankScanner

# other Bio.GenBank stuff
from .utils import FeatureValueCleaner

# Constants used to parse GenBank header lines
GENBANK_INDENT = 12
GENBANK_SPACER = " " * GENBANK_INDENT

# Constants for parsing GenBank feature lines
FEATURE_KEY_INDENT = 5
FEATURE_QUALIFIER_INDENT = 21
FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT


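# Illustrative sketch (not used by the code, and the record text is made up):
# these constants describe the fixed-column layout of GenBank flat files.
# Header keywords occupy a 12-character field, feature keys are indented by
# 5 spaces, and feature locations/qualifiers start after 21 spaces, e.g.
#
#     DEFINITION  Example sequence, definition text starts at column 13.
#          CDS             complement(3..1331)
#                          /gene="example"

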
class Iterator:
    """Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE).

    This class is likely to be deprecated in a future release of Biopython.
    Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...)
    for SeqRecord and GenBank specific Record objects respectively instead.
    """

    def __init__(self, handle, parser=None):
        """Initialize the iterator.

        Arguments:
         - handle - A handle with GenBank entries to iterate through.
         - parser - An optional parser to pass the entries through before
           returning them. If None, then the raw entry will be returned.

        """
        self.handle = handle
        self._parser = parser

    def __next__(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is None:
            lines = []
            while True:
                line = self.handle.readline()
                if not line:
                    return None  # Premature end of file?
                lines.append(line)
                if line.rstrip() == "//":
                    break
            return "".join(lines)
        try:
            return self._parser.parse(self.handle)
        except StopIteration:
            return None

    def __iter__(self):
        """Iterate over the records."""
        return iter(self.__next__, None)


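# Example usage (a minimal sketch; "example.gbk" is a hypothetical multi-record
# GenBank file, not part of Biopython):
#
#     with open("example.gbk") as handle:
#         for raw in Iterator(handle):       # parser=None -> raw text entries
#             print(raw.splitlines()[0])     # e.g. the LOCUS line
#
# Passing parser=RecordParser() (or FeatureParser()) instead yields parsed
# Record (or SeqRecord) objects, which is what Bio.GenBank.parse() does below.

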
class ParserFailureError(ValueError):
    """Failure caused by some kind of problem in the parser."""


_cleaner = FeatureValueCleaner()


class FeatureParser:
    """Parse GenBank files into Seq + Feature objects (OBSOLETE).

    Direct use of this class is discouraged, and may be deprecated in
    a future release of Biopython.

    Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead.
    """

    def __init__(self, debug_level=0, use_fuzziness=1, feature_cleaner=None):
        """Initialize a GenBank parser and Feature consumer.

        Arguments:
         - debug_level - An optional argument that specifies the amount of
           debugging information the parser should spit out. By default we have
           no debugging info (the fastest way to do things), but if you want
           you can set this as high as two and see exactly where a parse fails.
         - use_fuzziness - Specify whether or not to use fuzzy representations.
           The default is 1 (use fuzziness).
         - feature_cleaner - A class which will be used to clean out the
           values of features. This class must implement the function
           clean_value. GenBank.utils has a "standard" cleaner class, which
           is used by default.

        """
        self._scanner = GenBankScanner(debug_level)
        self.use_fuzziness = use_fuzziness
        if feature_cleaner:
            self._cleaner = feature_cleaner
        else:
            self._cleaner = _cleaner  # default

    def parse(self, handle):
        """Parse the specified handle."""
        _consumer = _FeatureConsumer(self.use_fuzziness, self._cleaner)
        self._scanner.feed(handle, _consumer)
        return _consumer.data


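# Example usage (a minimal sketch; "example.gbk" is a hypothetical file):
#
#     parser = FeatureParser()
#     with open("example.gbk") as handle:
#         seq_record = parser.parse(handle)   # a Bio.SeqRecord.SeqRecord
#     print(seq_record.id, len(seq_record.features))
#
# In new code, Bio.SeqIO.read(handle, "genbank") is the preferred way to get
# the same SeqRecord.

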
class RecordParser:
    """Parse GenBank files into Record objects (OBSOLETE).

    Direct use of this class is discouraged, and may be deprecated in
    a future release of Biopython.

    Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions
    instead.
    """

    def __init__(self, debug_level=0):
        """Initialize the parser.

        Arguments:
         - debug_level - An optional argument that specifies the amount of
           debugging information the parser should spit out. By default we have
           no debugging info (the fastest way to do things), but if you want
           you can set this as high as two and see exactly where a parse fails.

        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank record."""
        _consumer = _RecordConsumer()

        self._scanner.feed(handle, _consumer)
        return _consumer.data


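# Example usage (a minimal sketch; "example.gbk" is a hypothetical file):
#
#     parser = RecordParser()
#     with open("example.gbk") as handle:
#         record = parser.parse(handle)       # a Bio.GenBank.Record.Record
#     print(record.locus, record.accession)
#
# The module level read() and parse() helper functions at the end of this
# file wrap this class.

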
class _BaseGenBankConsumer:
    """Abstract GenBank consumer providing useful general functions (PRIVATE).

    This just helps to eliminate some duplication in things that most
    GenBank consumers want to do.
    """

    # Special keys in GenBank records that we should remove spaces from
    # For instance, \translation keys have values which are proteins and
    # should have spaces and newlines removed from them. This class
    # attribute gives us more control over specific formatting problems.
    remove_space_keys = ["translation"]

    def __init__(self):
        pass

    @staticmethod
    def _split_keywords(keyword_string):
        """Split a string of keywords into a nice clean list (PRIVATE)."""
        # process the keywords into a python list
        if keyword_string == "" or keyword_string == ".":
            keywords = ""
        elif keyword_string[-1] == ".":
            keywords = keyword_string[:-1]
        else:
            keywords = keyword_string
        keyword_list = keywords.split(";")
        return [x.strip() for x in keyword_list]

    @staticmethod
    def _split_accessions(accession_string):
        """Split a string of accession numbers into a list (PRIVATE)."""
        # first replace all line feeds with spaces
        # Also, EMBL style accessions are split with ';'
        accession = accession_string.replace("\n", " ").replace(";", " ")

        return [x.strip() for x in accession.split() if x.strip()]

    @staticmethod
    def _split_taxonomy(taxonomy_string):
        """Split a string with taxonomy info into a list (PRIVATE)."""
        if not taxonomy_string or taxonomy_string == ".":
            # Missing data, no taxonomy
            return []

        if taxonomy_string[-1] == ".":
            tax_info = taxonomy_string[:-1]
        else:
            tax_info = taxonomy_string
        tax_list = tax_info.split(";")
        new_tax_list = []
        for tax_item in tax_list:
            new_items = tax_item.split("\n")
            new_tax_list.extend(new_items)
        while "" in new_tax_list:
            new_tax_list.remove("")
        return [x.strip() for x in new_tax_list]

    @staticmethod
    def _clean_location(location_string):
        """Clean whitespace out of a location string (PRIVATE).

        The location parser isn't a fan of whitespace, so we clean it out
        before feeding it into the parser.
        """
        # Originally this imported string.whitespace and did a replace
        # via a loop. It's simpler to just split on whitespace and rejoin
        # the string - and this avoids importing string too. See Bug 2684.
        return "".join(location_string.split())

    @staticmethod
    def _remove_newlines(text):
        """Remove any newlines in the passed text, returning the new string (PRIVATE)."""
        # get rid of newlines in the qualifier value
        newlines = ["\n", "\r"]
        for ws in newlines:
            text = text.replace(ws, "")

        return text

    @staticmethod
    def _normalize_spaces(text):
        """Replace multiple spaces in the passed text with single spaces (PRIVATE)."""
        # get rid of excessive spaces
        return " ".join(x for x in text.split(" ") if x)

    @staticmethod
    def _remove_spaces(text):
        """Remove all spaces from the passed text (PRIVATE)."""
        return text.replace(" ", "")

    @staticmethod
    def _convert_to_python_numbers(start, end):
        """Convert a start and end range to python notation (PRIVATE).

        In GenBank, starts and ends are defined in "biological" coordinates,
        where 1 is the first base and [i, j] means to include both i and j.

        In python, 0 is the first base and [i, j] means to include i, but
        not j.

        So, to convert "biological" to python coordinates, we need to
        subtract 1 from the start, and leave the end and things should
        be converted happily.
        """
        new_start = start - 1
        new_end = end

        return new_start, new_end


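# Worked example for _convert_to_python_numbers() above: a GenBank reference
# span such as "bases 1 to 86436" (1-based, inclusive at both ends) becomes
# the Python coordinates (0, 86436), i.e. start - 1 with the end unchanged,
# so that end - start still equals the number of bases covered.

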
class _FeatureConsumer(_BaseGenBankConsumer):
    """Create a SeqRecord object with Features to return (PRIVATE).

    Attributes:
     - use_fuzziness - specify whether or not to parse with fuzziness in
       feature locations.
     - feature_cleaner - a class that will be used to provide specialized
       cleaning-up of feature values.

    """

    def __init__(self, use_fuzziness, feature_cleaner=None):
        from Bio.SeqRecord import SeqRecord

        _BaseGenBankConsumer.__init__(self)
        self.data = SeqRecord(None, id=None)
        self.data.id = None
        self.data.description = ""

        self._use_fuzziness = use_fuzziness
        self._feature_cleaner = feature_cleaner

        self._seq_type = ""
        self._seq_data = []
        self._cur_reference = None
        self._cur_feature = None
        self._expected_size = None

    def locus(self, locus_name):
        """Set the locus name as the name of the sequence record."""
        self.data.name = locus_name

    def size(self, content):
        """Record the sequence length."""
        self._expected_size = int(content)

    def residue_type(self, type):
        """Record the sequence type (SEMI-OBSOLETE).

        This reflects the fact that the topology (linear/circular) and
        molecule type (e.g. DNA vs RNA) were a single field in early
        files. Current GenBank/EMBL files have two fields.
        """
        self._seq_type = type.strip()

    def topology(self, topology):
        """Validate and record sequence topology.

        The topology argument should be "linear" or "circular" (string).
        """
        if topology:
            if topology not in ["linear", "circular"]:
                raise ParserFailureError(
                    f"Unexpected topology {topology!r} should be linear or circular"
                )
            self.data.annotations["topology"] = topology

    def molecule_type(self, mol_type):
        """Validate and record the molecule type (for round-trip etc)."""
        if mol_type:
            if "circular" in mol_type or "linear" in mol_type:
                raise ParserFailureError(
                    f"Molecule type {mol_type!r} should not include topology"
                )

            # Writing out records will fail if we have a lower case DNA
            # or RNA string in here, so upper case it.
            # This is a bit ugly, but we don't want to upper case e.g.
            # the m in mRNA, but thanks to the strip we lost the spaces
            # so we need to index from the back
            if mol_type[-3:].upper() in ("DNA", "RNA") and not mol_type[-3:].isupper():
                warnings.warn(
                    f"Non-upper case molecule type in LOCUS line: {mol_type}",
                    BiopythonParserWarning,
                )

            self.data.annotations["molecule_type"] = mol_type

    def data_file_division(self, division):
        self.data.annotations["data_file_division"] = division

    def date(self, submit_date):
        self.data.annotations["date"] = submit_date

    def definition(self, definition):
        """Set the definition as the description of the sequence."""
        if self.data.description:
            # Append to any existing description
            # e.g. EMBL files with two DE lines.
            self.data.description += " " + definition
        else:
            self.data.description = definition

    def accession(self, acc_num):
        """Set the accession number as the id of the sequence.

        If we have multiple accession numbers, the first one passed is
        used.
        """
        new_acc_nums = self._split_accessions(acc_num)

        # Also record them ALL in the annotations
        try:
            # On the off chance there was more than one accession line:
            for acc in new_acc_nums:
                # Prevent repeat entries
                if acc not in self.data.annotations["accessions"]:
                    self.data.annotations["accessions"].append(acc)
        except KeyError:
            self.data.annotations["accessions"] = new_acc_nums

        # if we haven't set the id information yet, add the first acc num
        if not self.data.id:
            if len(new_acc_nums) > 0:
                # self.data.id = new_acc_nums[0]
                # Use the FIRST accession as the ID, not the first on this line!
                self.data.id = self.data.annotations["accessions"][0]

    def tls(self, content):
        self.data.annotations["tls"] = content.split("-")

    def tsa(self, content):
        self.data.annotations["tsa"] = content.split("-")

    def wgs(self, content):
        self.data.annotations["wgs"] = content.split("-")

    def add_wgs_scafld(self, content):
        self.data.annotations.setdefault("wgs_scafld", []).append(content.split("-"))

    def nid(self, content):
        self.data.annotations["nid"] = content

    def pid(self, content):
        self.data.annotations["pid"] = content

    def version(self, version_id):
        # Want to use the versioned accession as the record.id
        # This comes from the VERSION line in GenBank files, or the
        # obsolete SV line in EMBL. For the new EMBL files we need
        # both the version suffix from the ID line and the accession
        # from the AC line.
        if version_id.count(".") == 1 and version_id.split(".")[1].isdigit():
            self.accession(version_id.split(".")[0])
            self.version_suffix(version_id.split(".")[1])
        elif version_id:
            # For backwards compatibility...
            self.data.id = version_id

    def project(self, content):
        """Handle the information from the PROJECT line as a list of projects.

        e.g.::

            PROJECT     GenomeProject:28471

        or::

            PROJECT     GenomeProject:13543  GenomeProject:99999

        This is stored as dbxrefs in the SeqRecord to be consistent with the
        projected switch of this line to DBLINK in future GenBank versions.
        Note the NCBI plan to replace "GenomeProject:28471" with the shorter
        "Project:28471" as part of this transition.
        """
        content = content.replace("GenomeProject:", "Project:")
        self.data.dbxrefs.extend(p for p in content.split() if p)

    def dblink(self, content):
        """Store DBLINK cross references as dbxrefs in our record object.

        This line type is expected to replace the PROJECT line in 2009. e.g.

        During transition::

            PROJECT     GenomeProject:28471
            DBLINK      Project:28471
                        Trace Assembly Archive:123456

        Once the project line is dropped::

            DBLINK      Project:28471
                        Trace Assembly Archive:123456

        Note GenomeProject -> Project.

        We'll have to see some real examples to be sure, but based on the
        above example we can expect one reference per line.

        Note that at some point the NCBI have included an extra space, e.g.::

            DBLINK      Project: 28471

        """
        # During the transition period with both PROJECT and DBLINK lines,
        # we don't want to add the same cross reference twice.
        while ": " in content:
            content = content.replace(": ", ":")
        if content.strip() not in self.data.dbxrefs:
            self.data.dbxrefs.append(content.strip())

    def version_suffix(self, version):
        """Set the version to overwrite the id.

        Since the version provides the same information as the accession
        number, plus some extra info, we set this as the id if we have
        a version.
        """
        # e.g. GenBank line:
        # VERSION     U49845.1  GI:1293613
        # or the obsolete EMBL line:
        # SV   U49845.1
        # Scanner calls consumer.version("U49845.1")
        # which then calls consumer.version_suffix(1)
        #
        # e.g. EMBL new line:
        # ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
        # Scanner calls consumer.version_suffix(1)
        assert version.isdigit()
        self.data.annotations["sequence_version"] = int(version)

    def db_source(self, content):
        self.data.annotations["db_source"] = content.rstrip()

    def gi(self, content):
        self.data.annotations["gi"] = content

    def keywords(self, content):
        if "keywords" in self.data.annotations:
            # Multi-line keywords, append to list
            # Note EMBL states "A keyword is never split between lines."
            self.data.annotations["keywords"].extend(self._split_keywords(content))
        else:
            self.data.annotations["keywords"] = self._split_keywords(content)

    def segment(self, content):
        self.data.annotations["segment"] = content

    def source(self, content):
        # Note that some software (e.g. VectorNTI) may produce an empty
        # source (rather than using a dot/period as might be expected).
        if content == "":
            source_info = ""
        elif content[-1] == ".":
            source_info = content[:-1]
        else:
            source_info = content
        self.data.annotations["source"] = source_info

    def organism(self, content):
        self.data.annotations["organism"] = content

    def taxonomy(self, content):
        """Record (another line of) the taxonomy lineage."""
        lineage = self._split_taxonomy(content)
        try:
            self.data.annotations["taxonomy"].extend(lineage)
        except KeyError:
            self.data.annotations["taxonomy"] = lineage

    def reference_num(self, content):
        """Signal the beginning of a new reference object."""
        # if we have a current reference that hasn't been added to
        # the list of references, add it.
        if self._cur_reference is not None:
            self.data.annotations["references"].append(self._cur_reference)
        else:
            self.data.annotations["references"] = []

        self._cur_reference = Reference()

    def reference_bases(self, content):
        """Attempt to determine the sequence region the reference entails.

        Possible types of information we may have to deal with:

        (bases 1 to 86436)
        (sites)
        (bases 1 to 105654; 110423 to 111122)
        1  (residues 1 to 182)
        """
        # first remove the parentheses
        assert content.endswith(")"), content
        ref_base_info = content[1:-1]

        all_locations = []
        # parse if we've got 'bases' and 'to'
        if "bases" in ref_base_info and "to" in ref_base_info:
            # get rid of the beginning 'bases'
            ref_base_info = ref_base_info[5:]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)
        elif "residues" in ref_base_info and "to" in ref_base_info:
            residues_start = ref_base_info.find("residues")
            # get only the information after "residues"
            ref_base_info = ref_base_info[(residues_start + len("residues ")) :]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)

        # make sure if we are not finding information then we have
        # the string 'sites' or the string 'bases'
        elif ref_base_info == "sites" or ref_base_info.strip() == "bases":
            pass
        # otherwise raise an error
        else:
            raise ValueError(
                f"Could not parse base info {ref_base_info} in record {self.data.id}"
            )

        self._cur_reference.location = all_locations

    def _split_reference_locations(self, location_string):
        """Get reference locations out of a string of reference information (PRIVATE).

        The passed string should be of the form::

            1 to 20; 20 to 100

        This splits the information out and returns a list of location objects
        based on the reference locations.
        """
        # split possibly multiple locations using the ';'
        all_base_info = location_string.split(";")

        new_locations = []
        for base_info in all_base_info:
            start, end = base_info.split("to")
            new_start, new_end = self._convert_to_python_numbers(
                int(start.strip()), int(end.strip())
            )
            this_location = SimpleLocation(new_start, new_end)
            new_locations.append(this_location)
        return new_locations

    def authors(self, content):
        if self._cur_reference.authors:
            self._cur_reference.authors += " " + content
        else:
            self._cur_reference.authors = content

    def consrtm(self, content):
        if self._cur_reference.consrtm:
            self._cur_reference.consrtm += " " + content
        else:
            self._cur_reference.consrtm = content

    def title(self, content):
        if self._cur_reference is None:
            warnings.warn(
                "GenBank TITLE line without REFERENCE line.", BiopythonParserWarning
            )
        elif self._cur_reference.title:
            self._cur_reference.title += " " + content
        else:
            self._cur_reference.title = content

    def journal(self, content):
        if self._cur_reference.journal:
            self._cur_reference.journal += " " + content
        else:
            self._cur_reference.journal = content

    def medline_id(self, content):
        self._cur_reference.medline_id = content

    def pubmed_id(self, content):
        self._cur_reference.pubmed_id = content

    def remark(self, content):
        """Deal with a reference comment."""
        if self._cur_reference.comment:
            self._cur_reference.comment += " " + content
        else:
            self._cur_reference.comment = content

    def comment(self, content):
        try:
            self.data.annotations["comment"] += "\n" + "\n".join(content)
        except KeyError:
            self.data.annotations["comment"] = "\n".join(content)

    def structured_comment(self, content):
        self.data.annotations["structured_comment"] = content

    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line."""
        self.start_feature_table()

    def start_feature_table(self):
        """Indicate we've got to the start of the feature table."""
        # make sure we've added on our last reference object
        if self._cur_reference is not None:
            self.data.annotations["references"].append(self._cur_reference)
            self._cur_reference = None

    def feature_key(self, content):
        # start a new feature
        self._cur_feature = SeqFeature()
        self._cur_feature.type = content
        self.data.features.append(self._cur_feature)

    def location(self, content):
        """Parse out location information from the location string.

        This uses simple Python code with some regular expressions to do the
        parsing, and then translates the results into appropriate objects.
        """
        # clean up newlines and other whitespace inside the location before
        # parsing - locations should have no whitespace whatsoever
        location_line = self._clean_location(content)

        # Older records have junk like replace(266,"c") in the
        # location line. Newer records just replace this with
        # the number 266 and have the information in a more reasonable
        # place. So we'll just grab out the number and feed this to the
        # parser. We shouldn't really be losing any info this way.
        if "replace" in location_line:
            comma_pos = location_line.find(",")
            location_line = location_line[8:comma_pos]

        length = self._expected_size
        # Check if the sequence is circular for features that span the origin
        is_circular = "circular" in self.data.annotations.get("topology", "").lower()
        stranded = "PROTEIN" not in self._seq_type.upper()

        try:
            location = Location.fromstring(location_line, length, is_circular, stranded)
        except LocationParserError as e:
            warnings.warn(
                f"{e}; setting feature location to None.", BiopythonParserWarning
            )
            location = None
        self._cur_feature.location = location

    def feature_qualifier(self, key, value):
        """When we get a qualifier key and its value.

        Can receive None, since you can have valueless keys such as /pseudo
        """
        # Hack to try to preserve historical behaviour of /pseudo etc
        if value is None:
            # if the key doesn't exist yet, add an empty string
            if key not in self._cur_feature.qualifiers:
                self._cur_feature.qualifiers[key] = [""]
                return
            # otherwise just skip this key
            return

        # Remove enclosing quotation marks
        if len(value) > 1 and value[0] == '"' and value[-1] == '"':
            value = value[1:-1]

        # Handle NCBI escaping
        # Warn if escaping is not according to standard
        if re.search(r'[^"]"[^"]|^"[^"]|[^"]"$', value):
            warnings.warn(
                'The NCBI states double-quote characters like " should be escaped as "" '
                "(two double quotes), but here it was not: %r" % value,
                BiopythonParserWarning,
            )
        # Undo escaping, repeated double quotes -> one double quote
        value = value.replace('""', '"')

        if self._feature_cleaner is not None:
            value = self._feature_cleaner.clean_value(key, value)

        # if the qualifier name exists, append the value
        if key in self._cur_feature.qualifiers:
            self._cur_feature.qualifiers[key].append(value)
        # otherwise start a new list of the key with its values
        else:
            self._cur_feature.qualifiers[key] = [value]

    def feature_qualifier_name(self, content_list):
        """Use feature_qualifier instead (OBSOLETE)."""
        raise NotImplementedError("Use the feature_qualifier method instead.")

    def feature_qualifier_description(self, content):
        """Use feature_qualifier instead (OBSOLETE)."""
        raise NotImplementedError("Use the feature_qualifier method instead.")

    def contig_location(self, content):
        """Deal with CONTIG information."""
        # Historically this was stored as a SeqFeature object, but it was
        # stored under record.annotations["contig"] and not under
        # record.features with the other SeqFeature objects.
        #
        # The CONTIG location line can include additional tokens like
        # Gap(), Gap(100) or Gap(unk100) which are not used in the feature
        # location lines, so storing it using SeqFeature based location
        # objects is difficult.
        #
        # We now store this as a string, which means for BioSQL we are now in
        # much better agreement with how BioPerl records the CONTIG line
        # in the database.
        #
        # NOTE - This code assumes the scanner will return all the CONTIG
        # lines already combined into one long string!
        self.data.annotations["contig"] = content

    def origin_name(self, content):
        pass

    def base_count(self, content):
        pass

    def base_number(self, content):
        pass

    def sequence(self, content):
        """Add up sequence information as we get it.

        To try and make things speedier, this puts all of the strings
        into a list of strings, and then uses string.join later to put
        them together. Supposedly, this is a big time savings.
        """
        assert " " not in content
        self._seq_data.append(content.upper())

    def record_end(self, content):
        """Clean up when we've finished the record."""
        # Try and append the version number to the accession for the full id
        if not self.data.id:
            if "accessions" in self.data.annotations:
                raise ValueError(
                    "Problem adding version number to accession: "
                    + str(self.data.annotations["accessions"])
                )
            self.data.id = self.data.name  # Good fall back?
        elif self.data.id.count(".") == 0:
            try:
                self.data.id += ".%i" % self.data.annotations["sequence_version"]
            except KeyError:
                pass

        # add the sequence information

        sequence = "".join(self._seq_data)

        if (
            self._expected_size is not None
            and len(sequence) != 0
            and self._expected_size != len(sequence)
        ):
            warnings.warn(
                "Expected sequence length %i, found %i (%s)."
                % (self._expected_size, len(sequence), self.data.id),
                BiopythonParserWarning,
            )

        molecule_type = None
        if self._seq_type:
            # mRNA is really also DNA, since it is actually cDNA
            if "DNA" in self._seq_type.upper() or "MRNA" in self._seq_type.upper():
                molecule_type = "DNA"
            # are there ever really RNA sequences in GenBank?
            elif "RNA" in self._seq_type.upper():
                # Even for data which was from RNA, the sequence string
                # is usually given as DNA (T not U). Bug 3010
                molecule_type = "RNA"
            elif (
                "PROTEIN" in self._seq_type.upper() or self._seq_type == "PRT"
            ):  # PRT is used in EMBL-bank for patents
                molecule_type = "protein"
            # work around ugly GenBank records which have circular or
            # linear but no indication of sequence type
            elif self._seq_type in ["circular", "linear", "unspecified"]:
                pass
            # we have a bug if we get here
            else:
                raise ValueError(
                    f"Could not determine molecule_type for seq_type {self._seq_type}"
                )
        # Don't overwrite molecule_type
        if molecule_type is not None:
            self.data.annotations["molecule_type"] = self.data.annotations.get(
                "molecule_type", molecule_type
            )
        if not sequence and self._expected_size:
            self.data.seq = Seq(None, length=self._expected_size)
        else:
            self.data.seq = Seq(sequence)


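# Example of the qualifier handling in _FeatureConsumer.feature_qualifier()
# above (a sketch with made-up values): a pair of feature qualifier lines like
#
#     /note="a ""quoted"" word"
#     /pseudo
#
# ends up in the resulting SeqRecord feature as
#
#     feature.qualifiers["note"] == ['a "quoted" word']   # "" unescaped to "
#     feature.qualifiers["pseudo"] == [""]                 # valueless key

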
class _RecordConsumer(_BaseGenBankConsumer):
    """Create a GenBank Record object from scanner generated information (PRIVATE)."""

    def __init__(self):
        _BaseGenBankConsumer.__init__(self)
        from . import Record

        self.data = Record.Record()

        self._seq_data = []
        self._cur_reference = None
        self._cur_feature = None
        self._cur_qualifier = None

    def tls(self, content):
        self.data.tls = content.split("-")

    def tsa(self, content):
        self.data.tsa = content.split("-")

    def wgs(self, content):
        self.data.wgs = content.split("-")

    def add_wgs_scafld(self, content):
        self.data.wgs_scafld.append(content.split("-"))

    def locus(self, content):
        self.data.locus = content

    def size(self, content):
        self.data.size = content

    def residue_type(self, content):
        # Be lenient about parsing, but technically lowercase residue types are malformed.
        if "dna" in content or "rna" in content:
            warnings.warn(
                f"Invalid seq_type ({content}): DNA/RNA should be uppercase.",
                BiopythonParserWarning,
            )
        self.data.residue_type = content

    def data_file_division(self, content):
        self.data.data_file_division = content

    def date(self, content):
        self.data.date = content

    def definition(self, content):
        self.data.definition = content

    def accession(self, content):
        for acc in self._split_accessions(content):
            if acc not in self.data.accession:
                self.data.accession.append(acc)

    def molecule_type(self, mol_type):
        """Validate and record the molecule type (for round-trip etc)."""
        if mol_type:
            if "circular" in mol_type or "linear" in mol_type:
                raise ParserFailureError(
                    f"Molecule type {mol_type!r} should not include topology"
                )

            # Writing out records will fail if we have a lower case DNA
            # or RNA string in here, so upper case it.
            # This is a bit ugly, but we don't want to upper case e.g.
            # the m in mRNA, but thanks to the strip we lost the spaces
            # so we need to index from the back
            if mol_type[-3:].upper() in ("DNA", "RNA") and not mol_type[-3:].isupper():
                warnings.warn(
                    f"Non-upper case molecule type in LOCUS line: {mol_type}",
                    BiopythonParserWarning,
                )

            self.data.molecule_type = mol_type

    def topology(self, topology):
        """Validate and record sequence topology.

        The topology argument should be "linear" or "circular" (string).
        """
        if topology:
            if topology not in ["linear", "circular"]:
                raise ParserFailureError(
                    f"Unexpected topology {topology!r} should be linear or circular"
                )
            self.data.topology = topology

    def nid(self, content):
        self.data.nid = content

    def pid(self, content):
        self.data.pid = content

    def version(self, content):
        self.data.version = content

    def db_source(self, content):
        self.data.db_source = content.rstrip()

    def gi(self, content):
        self.data.gi = content

    def keywords(self, content):
        self.data.keywords = self._split_keywords(content)

    def project(self, content):
        self.data.projects.extend(p for p in content.split() if p)

    def dblink(self, content):
        self.data.dblinks.append(content)

    def segment(self, content):
        self.data.segment = content

    def source(self, content):
        self.data.source = content

    def organism(self, content):
        self.data.organism = content

    def taxonomy(self, content):
        self.data.taxonomy = self._split_taxonomy(content)

    def reference_num(self, content):
        """Grab the reference number and signal the start of a new reference."""
        # check if we have a reference to add
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)

        from . import Record

        self._cur_reference = Record.Reference()
        self._cur_reference.number = content

    def reference_bases(self, content):
        self._cur_reference.bases = content

    def authors(self, content):
        self._cur_reference.authors = content

    def consrtm(self, content):
        self._cur_reference.consrtm = content

    def title(self, content):
        if self._cur_reference is None:
            warnings.warn(
                "GenBank TITLE line without REFERENCE line.", BiopythonParserWarning
            )
            return
        self._cur_reference.title = content

    def journal(self, content):
        self._cur_reference.journal = content

    def medline_id(self, content):
        self._cur_reference.medline_id = content

    def pubmed_id(self, content):
        self._cur_reference.pubmed_id = content

    def remark(self, content):
        self._cur_reference.remark = content

    def comment(self, content):
        self.data.comment += "\n".join(content)

    def structured_comment(self, content):
        self.data.structured_comment = content

    def primary_ref_line(self, content):
        """Save reference data for the PRIMARY line."""
        self.data.primary.append(content)

    def primary(self, content):
        pass

    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line."""
        self.start_feature_table()

    def start_feature_table(self):
        """Signal the start of the feature table."""
        # we need to add on the last reference
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)

    def feature_key(self, content):
        """Grab the key of the feature and signal the start of a new feature."""
        # first add on feature information if we've got any
        self._add_feature()

        from . import Record

        self._cur_feature = Record.Feature()
        self._cur_feature.key = content

    def _add_feature(self):
        """Add a feature to the record, with relevant checks (PRIVATE).

        This does all of the appropriate checking to make sure we haven't
        left any info behind, and that we are only adding info if it
        exists.
        """
        if self._cur_feature is not None:
            # if we have a left over qualifier, add it to the qualifiers
            # on the current feature
            if self._cur_qualifier is not None:
                self._cur_feature.qualifiers.append(self._cur_qualifier)

            self._cur_qualifier = None
            self.data.features.append(self._cur_feature)

    def location(self, content):
        self._cur_feature.location = self._clean_location(content)

    def feature_qualifier(self, key, value):
        self.feature_qualifier_name([key])
        if value is not None:
            self.feature_qualifier_description(value)

    def feature_qualifier_name(self, content_list):
        """Deal with qualifier names.

        We receive a list of keys, since you can have valueless keys such as
        /pseudo which would be passed in with the next key (since no other
        tags separate them in the file)
        """
        from . import Record

        for content in content_list:
            # the record parser keeps the /s -- add them if we don't have 'em
            if not content.startswith("/"):
                content = f"/{content}"
            # add on a qualifier if we've got one
            if self._cur_qualifier is not None:
                self._cur_feature.qualifiers.append(self._cur_qualifier)

            self._cur_qualifier = Record.Qualifier()
            self._cur_qualifier.key = content

    def feature_qualifier_description(self, content):
        # if we have info then the qualifier key should have an '=' on the end
        if "=" not in self._cur_qualifier.key:
            self._cur_qualifier.key = f"{self._cur_qualifier.key}="
        cur_content = self._remove_newlines(content)
        # remove all spaces from the value if it is a type where spaces
        # are not important
        for remove_space_key in self.__class__.remove_space_keys:
            if remove_space_key in self._cur_qualifier.key:
                cur_content = self._remove_spaces(cur_content)
        self._cur_qualifier.value = self._normalize_spaces(cur_content)

    def base_count(self, content):
        self.data.base_counts = content

    def origin_name(self, content):
        self.data.origin = content

    def contig_location(self, content):
        """Signal that we have contig information to add to the record."""
        self.data.contig = self._clean_location(content)

    def sequence(self, content):
        """Add sequence information to a list of sequence strings.

        This removes spaces in the data and uppercases the sequence, and
        then adds it to a list of sequences. Later on we'll join this
        list together to make the final sequence. This is faster than
        adding on the new string every time.
        """
        assert " " not in content
        self._seq_data.append(content.upper())

    def record_end(self, content):
        """Signal the end of the record and do any necessary clean-up."""
        # add together all of the sequence parts to create the
        # final sequence string
        self.data.sequence = "".join(self._seq_data)
        # add on the last feature
        self._add_feature()


def parse(handle):
    """Iterate over GenBank formatted entries as Record objects.

    >>> from Bio import GenBank
    >>> with open("GenBank/NC_000932.gb") as handle:
    ...     for record in GenBank.parse(handle):
    ...         print(record.accession)
    ['NC_000932']

    To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb")
    instead.
    """
    return iter(Iterator(handle, RecordParser()))


def read(handle):
    """Read a handle containing a single GenBank entry as a Record object.

    >>> from Bio import GenBank
    >>> with open("GenBank/NC_000932.gb") as handle:
    ...     record = GenBank.read(handle)
    ...     print(record.accession)
    ['NC_000932']

    To get a SeqRecord object use Bio.SeqIO.read(..., format="gb")
    instead.
    """
    iterator = parse(handle)
    try:
        record = next(iterator)
    except StopIteration:
        raise ValueError("No records found in handle") from None
    try:
        next(iterator)
        raise ValueError("More than one record found in handle")
    except StopIteration:
        pass
    return record


if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()