# Mirror of https://github.com/biopython/biopython.git (synced 2025-10-20).
# Imports normalized with ruff 0.4.10 (isort, force-single-line).
# Copyright 2002 by Andrew Dalke. All rights reserved.
# Revisions 2007-2016 copyright by Peter Cock. All rights reserved.
# Revisions 2008 copyright by Cymon J. Cox. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
#
# Note that BioSQL (including the database schema and scripts) is
# available and licensed separately. Please consult www.biosql.org
|
"""Load biopython objects into a BioSQL database for persistent storage.
|
|
|
|
This code makes it possible to store biopython objects in a relational
|
|
database and then retrieve them back. You shouldn't use any of the
|
|
classes in this module directly. Rather, call the load() method on
|
|
a database object.
|
|
"""
|
|
|
|
# standard modules
from time import gmtime
from time import strftime

from Bio import Entrez
from Bio.Seq import UndefinedSequenceError
from Bio.SeqFeature import UnknownPosition

# biopython
from Bio.SeqUtils.CheckSum import crc64
|
|
|
|
|
|
class DatabaseLoader:
|
|
"""Object used to load SeqRecord objects into a BioSQL database."""
|
|
|
|
def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
|
|
"""Initialize with connection information for the database.
|
|
|
|
Creating a DatabaseLoader object is normally handled via the
|
|
BioSeqDatabase DBServer object, for example::
|
|
|
|
from BioSQL import BioSeqDatabase
|
|
server = BioSeqDatabase.open_database(driver="MySQLdb",
|
|
user="gbrowse",
|
|
passwd="biosql",
|
|
host="localhost",
|
|
db="test_biosql")
|
|
try:
|
|
db = server["test"]
|
|
except KeyError:
|
|
db = server.new_database("test",
|
|
description="For testing GBrowse")
|
|
|
|
"""
|
|
self.adaptor = adaptor
|
|
self.dbid = dbid
|
|
self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
|
|
|
|
def load_seqrecord(self, record):
|
|
"""Load a Biopython SeqRecord into the database."""
|
|
bioentry_id = self._load_bioentry_table(record)
|
|
self._load_bioentry_date(record, bioentry_id)
|
|
self._load_biosequence(record, bioentry_id)
|
|
self._load_comment(record, bioentry_id)
|
|
self._load_dbxrefs(record, bioentry_id)
|
|
references = record.annotations.get("references", ())
|
|
for reference, rank in zip(references, list(range(len(references)))):
|
|
self._load_reference(reference, rank, bioentry_id)
|
|
self._load_annotations(record, bioentry_id)
|
|
for seq_feature_num in range(len(record.features)):
|
|
seq_feature = record.features[seq_feature_num]
|
|
self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
|
|
|
|
def _get_ontology_id(self, name, definition=None):
|
|
"""Return identifier for the named ontology (PRIVATE).
|
|
|
|
This looks through the onotology table for a the given entry name.
|
|
If it is not found, a row is added for this ontology (using the
|
|
definition if supplied). In either case, the id corresponding to
|
|
the provided name is returned, so that you can reference it in
|
|
another table.
|
|
"""
|
|
oids = self.adaptor.execute_and_fetch_col0(
|
|
"SELECT ontology_id FROM ontology WHERE name = %s", (name,)
|
|
)
|
|
if oids:
|
|
return oids[0]
|
|
self.adaptor.execute(
|
|
"INSERT INTO ontology(name, definition) VALUES (%s, %s)", (name, definition)
|
|
)
|
|
return self.adaptor.last_id("ontology")
|
|
|
|
def _get_term_id(self, name, ontology_id=None, definition=None, identifier=None):
|
|
"""Get the id that corresponds to a term (PRIVATE).
|
|
|
|
This looks through the term table for a the given term. If it
|
|
is not found, a new id corresponding to this term is created.
|
|
In either case, the id corresponding to that term is returned, so
|
|
that you can reference it in another table.
|
|
|
|
The ontology_id should be used to disambiguate the term.
|
|
"""
|
|
# try to get the term id
|
|
sql = "SELECT term_id FROM term WHERE name = %s"
|
|
fields = [name]
|
|
if ontology_id:
|
|
sql += " AND ontology_id = %s"
|
|
fields.append(ontology_id)
|
|
id_results = self.adaptor.execute_and_fetchall(sql, fields)
|
|
# something is wrong
|
|
if len(id_results) > 1:
|
|
raise ValueError(f"Multiple term ids for {name}: {id_results!r}")
|
|
elif len(id_results) == 1:
|
|
return id_results[0][0]
|
|
else:
|
|
sql = (
|
|
"INSERT INTO term (name, definition,"
|
|
" identifier, ontology_id)"
|
|
" VALUES (%s, %s, %s, %s)"
|
|
)
|
|
self.adaptor.execute(sql, (name, definition, identifier, ontology_id))
|
|
return self.adaptor.last_id("term")
|
|
|
|
def _add_dbxref(self, dbname, accession, version):
|
|
"""Insert a dbxref and return its id (PRIVATE)."""
|
|
self.adaptor.execute(
|
|
"INSERT INTO dbxref(dbname, accession, version) VALUES (%s, %s, %s)",
|
|
(dbname, accession, version),
|
|
)
|
|
return self.adaptor.last_id("dbxref")
|
|
|
|
    def _get_taxon_id(self, record):
        """Get the taxon id for this record (PRIVATE).

        Arguments:
         - record - a SeqRecord object

        This searches the taxon/taxon_name tables using the
        NCBI taxon ID, scientific name and common name to find
        the matching taxon table entry's id.

        If the species isn't in the taxon table, and we have at
        least the NCBI taxon ID, scientific name or common name,
        at least a minimal stub entry is created in the table.

        Returns the taxon id (database key for the taxon table,
        not an NCBI taxon ID), or None if the taxonomy information
        is missing.

        See also the BioSQL script load_ncbi_taxonomy.pl which
        will populate and update the taxon/taxon_name tables
        with the latest information from the NCBI.
        """
        # To find the NCBI taxid, first check for a top level annotation
        ncbi_taxon_id = None
        if "ncbi_taxid" in record.annotations:
            # Could be a list of IDs.
            if isinstance(record.annotations["ncbi_taxid"], list):
                # Only use a list annotation when it is unambiguous
                # (exactly one entry); otherwise it stays None.
                if len(record.annotations["ncbi_taxid"]) == 1:
                    ncbi_taxon_id = record.annotations["ncbi_taxid"][0]
            else:
                ncbi_taxon_id = record.annotations["ncbi_taxid"]
        if not ncbi_taxon_id:
            # Secondly, look for a source feature with a "taxon:..." db_xref
            for f in record.features:
                if f.type == "source":
                    quals = getattr(f, "qualifiers", {})
                    if "db_xref" in quals:
                        for db_xref in f.qualifiers["db_xref"]:
                            if db_xref.startswith("taxon:"):
                                ncbi_taxon_id = int(db_xref[6:])
                                break
                if ncbi_taxon_id:
                    break

        try:
            scientific_name = record.annotations["organism"][:255]
        except KeyError:
            scientific_name = None
        try:
            common_name = record.annotations["source"][:255]
        except KeyError:
            common_name = None
        # Note: The maximum length for taxon names in the schema is 255.
        # Cropping it now should help in getting a match when searching,
        # and avoids an error if we try and add these to the database.

        if ncbi_taxon_id:
            # Good, we have the NCBI taxon to go on - this is unambiguous :)
            # Note that the scientific name and common name will only be
            # used if we have to record a stub entry.
            return self._get_taxon_id_from_ncbi_taxon_id(
                ncbi_taxon_id, scientific_name, common_name
            )

        if not common_name and not scientific_name:
            # Nothing to go on... and there is no point adding
            # a new entry to the database. We'll just leave this
            # sequence's taxon as a NULL in the database.
            return None

        # Next, we'll try to find a match based on the species name
        # (stored in GenBank files as the organism and/or the source).
        if scientific_name:
            taxa = self.adaptor.execute_and_fetch_col0(
                "SELECT taxon_id FROM taxon_name"
                " WHERE name_class = 'scientific name' AND name = %s",
                (scientific_name,),
            )
            if taxa:
                # Good, mapped the scientific name to a taxon table entry
                return taxa[0]

        # Last chance...
        if common_name:
            taxa = self.adaptor.execute_and_fetch_col0(
                "SELECT DISTINCT taxon_id FROM taxon_name WHERE name = %s",
                (common_name,),
            )
            # Its natural that several distinct taxa will have the same common
            # name - in which case we can't resolve the taxon uniquely.
            if len(taxa) > 1:
                raise ValueError(
                    "Taxa: %d species have name %r" % (len(taxa), common_name)
                )
            if taxa:
                # Good, mapped the common name to a taxon table entry
                return taxa[0]

        # At this point, as far as we can tell, this species isn't
        # in the taxon table already. So we'll have to add it.
        # We don't have an NCBI taxonomy ID, so if we do record just
        # a stub entry, there is no simple way to fix this later.
        #
        # TODO - Should we try searching the NCBI taxonomy using the
        # species name?
        #
        # OK, let's try inserting the species.
        # Chances are we don't have enough information ...
        # Furthermore, it won't be in the hierarchy.

        # Build a minimal lineage as [ncbi_taxon_id, rank, name] triples.
        lineage = []
        for c in record.annotations.get("taxonomy", []):
            lineage.append([None, None, c])
        if lineage:
            # Treat the last taxonomy annotation entry as the genus.
            lineage[-1][1] = "genus"
        # NOTE(review): this raises KeyError when the "organism" annotation
        # is absent, which is reachable here since only the common name is
        # required to get this far - confirm whether that is intended.
        lineage.append([None, "species", record.annotations["organism"]])
        # XXX do we have them?
        if "subspecies" in record.annotations:
            lineage.append([None, "subspecies", record.annotations["subspecies"]])
        if "variant" in record.annotations:
            lineage.append([None, "varietas", record.annotations["variant"]])
        # ncbi_taxon_id is always falsy at this point (we returned earlier
        # otherwise), so no NCBI id is recorded for the terminal node.
        lineage[-1][0] = ncbi_taxon_id

        left_value = self.adaptor.execute_one("SELECT MAX(left_value) FROM taxon")[0]
        if not left_value:
            left_value = 0
        left_value += 1

        # XXX -- Brad: Fixing this for now in an ugly way because
        # I am getting overlaps for right_values. I need to dig into this
        # more to actually understand how it works. I'm not sure it is
        # actually working right anyhow.
        right_start_value = self.adaptor.execute_one(
            "SELECT MAX(right_value) FROM taxon"
        )[0]
        if not right_start_value:
            right_start_value = 0
        right_value = right_start_value + 2 * len(lineage) - 1

        # Insert the lineage from root-most node down to the species,
        # nesting the left/right values inwards at each level.
        parent_taxon_id = None
        for taxon in lineage:
            self.adaptor.execute(
                "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"
                " left_value, right_value)"
                " VALUES (%s, %s, %s, %s, %s)",
                (parent_taxon_id, taxon[0], taxon[1], left_value, right_value),
            )
            taxon_id = self.adaptor.last_id("taxon")
            self.adaptor.execute(
                "INSERT INTO taxon_name(taxon_id, name, name_class)"
                "VALUES (%s, %s, 'scientific name')",
                (taxon_id, taxon[2][:255]),
            )
            # Note the name field is limited to 255, some SwissProt files
            # have a multi-species name which can be longer. So truncate this.
            left_value += 1
            right_value -= 1
            parent_taxon_id = taxon_id
        if common_name:
            # Attach the common name to the last (species-level) node.
            self.adaptor.execute(
                "INSERT INTO taxon_name(taxon_id, name, name_class)"
                "VALUES (%s, %s, 'common name')",
                (taxon_id, common_name),
            )

        return taxon_id
|
|
|
|
def _fix_name_class(self, entrez_name):
|
|
"""Map Entrez name terms to those used in taxdump (PRIVATE).
|
|
|
|
We need to make this conversion to match the taxon_name.name_class
|
|
values used by the BioSQL load_ncbi_taxonomy.pl script.
|
|
|
|
e.g.::
|
|
|
|
"ScientificName" -> "scientific name",
|
|
"EquivalentName" -> "equivalent name",
|
|
"Synonym" -> "synonym",
|
|
|
|
"""
|
|
# Add any special cases here:
|
|
#
|
|
# known = {}
|
|
# try:
|
|
# return known[entrez_name]
|
|
# except KeyError:
|
|
# pass
|
|
|
|
# Try automatically by adding spaces before each capital
|
|
def add_space(letter):
|
|
"""Add a space before a capital letter."""
|
|
if letter.isupper():
|
|
return " " + letter.lower()
|
|
else:
|
|
return letter
|
|
|
|
answer = "".join(add_space(letter) for letter in entrez_name).strip()
|
|
if answer != answer.lower():
|
|
raise ValueError(
|
|
f"Expected processed entrez_name, '{answer}' to only have lower case letters."
|
|
)
|
|
return answer
|
|
|
|
def _update_left_right_taxon_values(self, left_value):
|
|
"""Update the left and right taxon values in the table (PRIVATE)."""
|
|
if not left_value:
|
|
return
|
|
# Due to the UNIQUE constraint on the left and right values in the taxon
|
|
# table we cannot simply update them through an SQL statement as we risk
|
|
# colliding values. Instead we must select all of the rows that we want to
|
|
# update, modify the values in python and then update the rows
|
|
# self.adaptor.execute("UPDATE taxon SET right_value = right_value + 2 "
|
|
# "WHERE right_value >= %s", (left_value,))
|
|
# self.adaptor.execute("UPDATE taxon SET left_value = left_value + 2 "
|
|
# "WHERE left_value > %s", (left_value,))
|
|
|
|
rows = self.adaptor.execute_and_fetchall(
|
|
"SELECT left_value, right_value, taxon_id FROM taxon "
|
|
"WHERE right_value >= %s or left_value > %s",
|
|
(left_value, left_value),
|
|
)
|
|
|
|
right_rows = []
|
|
left_rows = []
|
|
for row in rows:
|
|
new_right = row[1]
|
|
new_left = row[0]
|
|
if new_right >= left_value:
|
|
new_right += 2
|
|
|
|
if new_left > left_value:
|
|
new_left += 2
|
|
right_rows.append((new_right, row[2]))
|
|
left_rows.append((new_left, row[2]))
|
|
|
|
# sort the rows based on the value from largest to smallest
|
|
# should ensure no overlaps
|
|
right_rows = sorted(right_rows, key=lambda x: x[0], reverse=True)
|
|
left_rows = sorted(left_rows, key=lambda x: x[0], reverse=True)
|
|
|
|
self.adaptor.executemany(
|
|
"UPDATE taxon SET left_value = %s WHERE taxon_id = %s", left_rows
|
|
)
|
|
self.adaptor.executemany(
|
|
"UPDATE taxon SET right_value = %s WHERE taxon_id = %s", right_rows
|
|
)
|
|
|
|
    def _get_taxon_id_from_ncbi_taxon_id(
        self, ncbi_taxon_id, scientific_name=None, common_name=None
    ):
        """Get the taxon id for record from NCBI taxon ID (PRIVATE).

        Arguments:
         - ncbi_taxon_id - string containing an NCBI taxon id
         - scientific_name - string, used if a stub entry is recorded
         - common_name - string, used if a stub entry is recorded

        This searches the taxon table using ONLY the NCBI taxon ID
        to find the matching taxon table entry's ID (database key).

        If the species isn't in the taxon table, and the fetch_NCBI_taxonomy
        flag is true, Biopython will attempt to go online using Bio.Entrez
        to fetch the official NCBI lineage, recursing up the tree until an
        existing entry is found in the database or the full lineage has been
        fetched.

        Otherwise the NCBI taxon ID, scientific name and common name are
        recorded as a minimal stub entry in the taxon and taxon_name tables.
        Any partial information about the lineage from the SeqRecord is NOT
        recorded. This should mean that (re)running the BioSQL script
        load_ncbi_taxonomy.pl can fill in the taxonomy lineage.

        Returns the taxon id (database key for the taxon table, not
        an NCBI taxon ID).
        """
        if not ncbi_taxon_id:
            raise ValueError("Expected a non-empty value for ncbi_taxon_id.")

        taxon_id = self.adaptor.execute_and_fetch_col0(
            "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", (int(ncbi_taxon_id),)
        )
        if taxon_id:
            # Good, we have mapped the NCBI taxid to a taxon table entry
            return taxon_id[0]

        # At this point, as far as we can tell, this species isn't
        # in the taxon table already. So we'll have to add it.

        # Defaults for the stub-entry (offline) path; the online path
        # below overwrites most of these from the Entrez record.
        parent_taxon_id = None
        rank = "species"
        genetic_code = None
        mito_genetic_code = None
        parent_left_value = None
        parent_right_value = None
        left_value = None
        right_value = None
        species_names = []
        if scientific_name:
            species_names.append(("scientific name", scientific_name))
        if common_name:
            species_names.append(("common name", common_name))

        if self.fetch_NCBI_taxonomy:
            # Go online to get the parent taxon ID!
            handle = Entrez.efetch(db="taxonomy", id=ncbi_taxon_id, retmode="XML")
            taxonomic_record = Entrez.read(handle)
            if len(taxonomic_record) == 1:
                # Sanity check the record we got back is the one we asked for.
                if taxonomic_record[0]["TaxId"] != str(ncbi_taxon_id):
                    raise ValueError(
                        f"ncbi_taxon_id different from parent taxon id. {ncbi_taxon_id} versus {taxonomic_record[0]['TaxId']}"
                    )

                # Record the full lineage recursively; the new node is
                # slotted in just inside the parent's right value.
                (
                    parent_taxon_id,
                    parent_left_value,
                    parent_right_value,
                ) = self._get_taxon_id_from_ncbi_lineage(
                    taxonomic_record[0]["LineageEx"]
                )

                left_value = parent_right_value
                right_value = parent_right_value + 1

                rank = str(taxonomic_record[0]["Rank"])

                genetic_code = int(taxonomic_record[0]["GeneticCode"]["GCId"])

                mito_genetic_code = int(taxonomic_record[0]["MitoGeneticCode"]["MGCId"])

                # Replace the caller-supplied names with NCBI's official ones.
                species_names = [
                    ("scientific name", str(taxonomic_record[0]["ScientificName"]))
                ]
                try:
                    for name_class, names in taxonomic_record[0]["OtherNames"].items():
                        name_class = self._fix_name_class(name_class)
                        if not isinstance(names, list):
                            # The Entrez parser seems to return single entry
                            # lists as just a string which is annoying.
                            names = [names]
                        for name in names:
                            # Want to ignore complex things like ClassCDE
                            # entries
                            if isinstance(name, str):
                                species_names.append((name_class, name))
                except KeyError:
                    # OtherNames isn't always present,
                    # e.g. NCBI taxon 41205, Bromheadia finlaysoniana
                    pass
        else:
            pass
            # If we are not allowed to go online, we will record the bare minimum;
            # as long as the NCBI taxon id is present, then (re)running
            # load_ncbi_taxonomy.pl should fill in the taxonomomy lineage
            # (and update the species names).
            #
            # I am NOT going to try and record the lineage, even if it
            # is in the record annotation as a list of names, as we won't
            # know the NCBI taxon IDs for these parent nodes.

        # No-op when left_value is still None (the offline stub path).
        self._update_left_right_taxon_values(left_value)

        self.adaptor.execute(
            "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"
            " genetic_code, mito_genetic_code, left_value, right_value)"
            " VALUES (%s, %s, %s, %s, %s, %s, %s)",
            (
                parent_taxon_id,
                ncbi_taxon_id,
                rank,
                genetic_code,
                mito_genetic_code,
                left_value,
                right_value,
            ),
        )

        taxon_id = self.adaptor.last_id("taxon")

        # Record the scientific name, common name, etc
        for name_class, name in species_names:
            self.adaptor.execute(
                "INSERT INTO taxon_name(taxon_id, name, name_class)"
                " VALUES (%s, %s, %s)",
                (taxon_id, name[:255], name_class),
            )
        return taxon_id
|
|
|
|
def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage):
|
|
"""Recursive method to get taxon ID from NCBI lineage (PRIVATE).
|
|
|
|
Arguments:
|
|
- taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez
|
|
|
|
First dictionary in list is the taxonomy root, highest would be
|
|
the species. Each dictionary includes:
|
|
|
|
- TaxID (string, NCBI taxon id)
|
|
- Rank (string, e.g. "species", "genus", ..., "phylum", ...)
|
|
- ScientificName (string)
|
|
|
|
(and that is all at the time of writing)
|
|
|
|
This method will record all the lineage given, returning the taxon id
|
|
(database key, not NCBI taxon id) of the final entry (the species).
|
|
"""
|
|
ncbi_taxon_id = int(taxonomic_lineage[-1]["TaxId"])
|
|
left_value = None
|
|
right_value = None
|
|
parent_left_value = None
|
|
parent_right_value = None
|
|
# Is this in the database already? Check the taxon table...
|
|
rows = self.adaptor.execute_and_fetchall(
|
|
"SELECT taxon_id, left_value, right_value FROM taxon"
|
|
" WHERE ncbi_taxon_id=%s" % ncbi_taxon_id
|
|
)
|
|
if rows:
|
|
# we could verify that the Scientific Name etc in the database
|
|
# is the same and update it or print a warning if not...
|
|
if len(rows) != 1:
|
|
raise ValueError(f"Expected 1 response, got {len(rows)}")
|
|
return rows[0]
|
|
|
|
# We have to record this.
|
|
if len(taxonomic_lineage) > 1:
|
|
# Use recursion to find out the taxon id (database key) of the
|
|
# parent.
|
|
(
|
|
parent_taxon_id,
|
|
parent_left_value,
|
|
parent_right_value,
|
|
) = self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1])
|
|
left_value = parent_right_value
|
|
right_value = parent_right_value + 1
|
|
if not isinstance(parent_taxon_id, int):
|
|
raise ValueError(
|
|
f"Expected parent_taxon_id to be an int, got {parent_taxon_id}"
|
|
)
|
|
else:
|
|
# we have reached the top of the lineage but no current taxonomy
|
|
# id has been found
|
|
parent_taxon_id = None
|
|
left_value = self.adaptor.execute_one("SELECT MAX(left_value) FROM taxon")[
|
|
0
|
|
]
|
|
if not left_value:
|
|
left_value = 0
|
|
|
|
right_value = left_value + 1
|
|
|
|
self._update_left_right_taxon_values(left_value)
|
|
|
|
# INSERT new taxon
|
|
rank = str(taxonomic_lineage[-1].get("Rank"))
|
|
self.adaptor.execute(
|
|
"INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank, "
|
|
"left_value, right_value) VALUES (%s, %s, %s, %s, %s)",
|
|
(ncbi_taxon_id, parent_taxon_id, rank, left_value, right_value),
|
|
)
|
|
|
|
taxon_id = self.adaptor.last_id("taxon")
|
|
# assert isinstance(taxon_id, int), repr(taxon_id)
|
|
# ... and its name in taxon_name
|
|
scientific_name = taxonomic_lineage[-1].get("ScientificName")
|
|
if scientific_name:
|
|
self.adaptor.execute(
|
|
"INSERT INTO taxon_name(taxon_id, name, name_class) "
|
|
"VALUES (%s, %s, 'scientific name')",
|
|
(taxon_id, scientific_name[:255]),
|
|
)
|
|
return taxon_id, left_value, right_value
|
|
|
|
def _load_bioentry_table(self, record):
|
|
"""Fill the bioentry table with sequence information (PRIVATE).
|
|
|
|
Arguments:
|
|
- record - SeqRecord object to add to the database.
|
|
|
|
"""
|
|
# get the pertinent info and insert it
|
|
|
|
if record.id.count(".") == 1: # try to get a version from the id
|
|
# This assumes the string is something like "XXXXXXXX.123"
|
|
accession, version = record.id.split(".")
|
|
try:
|
|
version = int(version)
|
|
except ValueError:
|
|
accession = record.id
|
|
version = 0
|
|
else: # otherwise just use a version of 0
|
|
accession = record.id
|
|
version = 0
|
|
|
|
if (
|
|
"accessions" in record.annotations
|
|
and isinstance(record.annotations["accessions"], list)
|
|
and record.annotations["accessions"]
|
|
):
|
|
# Take the first accession (one if there is more than one)
|
|
accession = record.annotations["accessions"][0]
|
|
|
|
# Find the taxon id (this is not just the NCBI Taxon ID)
|
|
# NOTE - If the species isn't defined in the taxon table,
|
|
# a new minimal entry is created.
|
|
taxon_id = self._get_taxon_id(record)
|
|
|
|
if "gi" in record.annotations:
|
|
identifier = record.annotations["gi"]
|
|
else:
|
|
identifier = record.id
|
|
|
|
# Allow description and division to default to NULL as in BioPerl.
|
|
description = getattr(record, "description", None)
|
|
division = record.annotations.get("data_file_division")
|
|
|
|
sql = """
|
|
INSERT INTO bioentry (
|
|
biodatabase_id,
|
|
taxon_id,
|
|
name,
|
|
accession,
|
|
identifier,
|
|
division,
|
|
description,
|
|
version)
|
|
VALUES (
|
|
%s,
|
|
%s,
|
|
%s,
|
|
%s,
|
|
%s,
|
|
%s,
|
|
%s,
|
|
%s)"""
|
|
# print(self.dbid, taxon_id, record.name, accession, identifier, \
|
|
# division, description, version)
|
|
self.adaptor.execute(
|
|
sql,
|
|
(
|
|
self.dbid,
|
|
taxon_id,
|
|
record.name,
|
|
accession,
|
|
identifier,
|
|
division,
|
|
description,
|
|
version,
|
|
),
|
|
)
|
|
# now retrieve the id for the bioentry
|
|
return self.adaptor.last_id("bioentry")
|
|
|
|
def _load_bioentry_date(self, record, bioentry_id):
|
|
"""Add the effective date of the entry into the database (PRIVATE).
|
|
|
|
record - a SeqRecord object with an annotated date
|
|
bioentry_id - corresponding database identifier
|
|
"""
|
|
# dates are GenBank style, like:
|
|
# 14-SEP-2000
|
|
date = record.annotations.get("date", strftime("%d-%b-%Y", gmtime()).upper())
|
|
if isinstance(date, list):
|
|
date = date[0]
|
|
annotation_tags_id = self._get_ontology_id("Annotation Tags")
|
|
date_id = self._get_term_id("date_changed", annotation_tags_id)
|
|
sql = (
|
|
"INSERT INTO bioentry_qualifier_value"
|
|
' (bioentry_id, term_id, value, "rank")'
|
|
" VALUES (%s, %s, %s, 1)"
|
|
)
|
|
self.adaptor.execute(sql, (bioentry_id, date_id, date))
|
|
|
|
def _load_biosequence(self, record, bioentry_id):
|
|
"""Record SeqRecord's sequence and alphabet in DB (PRIVATE).
|
|
|
|
Arguments:
|
|
- record - a SeqRecord object with a seq property
|
|
- bioentry_id - corresponding database identifier
|
|
|
|
"""
|
|
if record.seq is None:
|
|
# The biosequence table entry is optional, so if we haven't
|
|
# got a sequence, we don't need to write to the table.
|
|
return
|
|
|
|
molecule_type = record.annotations.get("molecule_type", "")
|
|
if "DNA" in molecule_type:
|
|
alphabet = "dna"
|
|
elif "RNA" in molecule_type:
|
|
alphabet = "rna"
|
|
elif "protein" in molecule_type:
|
|
alphabet = "protein"
|
|
else:
|
|
alphabet = "unknown"
|
|
|
|
try:
|
|
seq_str = str(record.seq)
|
|
except UndefinedSequenceError:
|
|
seq_str = None
|
|
|
|
sql = (
|
|
"INSERT INTO biosequence (bioentry_id, version, "
|
|
"length, seq, alphabet) "
|
|
"VALUES (%s, 0, %s, %s, %s)"
|
|
)
|
|
self.adaptor.execute(sql, (bioentry_id, len(record.seq), seq_str, alphabet))
|
|
|
|
def _load_comment(self, record, bioentry_id):
|
|
"""Record a SeqRecord's annotated comment in the database (PRIVATE).
|
|
|
|
Arguments:
|
|
- record - a SeqRecord object with an annotated comment
|
|
- bioentry_id - corresponding database identifier
|
|
|
|
"""
|
|
comments = record.annotations.get("comment")
|
|
if not comments:
|
|
return
|
|
if not isinstance(comments, list):
|
|
# It should be a string then...
|
|
comments = [comments]
|
|
|
|
for index, comment in enumerate(comments):
|
|
comment = comment.replace("\n", " ")
|
|
# TODO - Store each line as a separate entry? This would preserve
|
|
# the newlines, but we should check BioPerl etc to be consistent.
|
|
sql = (
|
|
'INSERT INTO comment (bioentry_id, comment_text, "rank")'
|
|
" VALUES (%s, %s, %s)"
|
|
)
|
|
self.adaptor.execute(sql, (bioentry_id, comment, index + 1))
|
|
|
|
def _load_annotations(self, record, bioentry_id):
|
|
"""Record a SeqRecord's misc annotations in the database (PRIVATE).
|
|
|
|
The annotation strings are recorded in the bioentry_qualifier_value
|
|
table, except for special cases like the reference, comment and
|
|
taxonomy which are handled with their own tables.
|
|
|
|
Arguments:
|
|
- record - a SeqRecord object with an annotations dictionary
|
|
- bioentry_id - corresponding database identifier
|
|
|
|
"""
|
|
mono_sql = (
|
|
"INSERT INTO bioentry_qualifier_value"
|
|
"(bioentry_id, term_id, value)"
|
|
" VALUES (%s, %s, %s)"
|
|
)
|
|
many_sql = (
|
|
"INSERT INTO bioentry_qualifier_value"
|
|
'(bioentry_id, term_id, value, "rank")'
|
|
" VALUES (%s, %s, %s, %s)"
|
|
)
|
|
tag_ontology_id = self._get_ontology_id("Annotation Tags")
|
|
for key, value in record.annotations.items():
|
|
if key in ["molecule_type", "references", "comment", "ncbi_taxid", "date"]:
|
|
# Handled separately
|
|
continue
|
|
term_id = self._get_term_id(key, ontology_id=tag_ontology_id)
|
|
if isinstance(value, (list, tuple)):
|
|
rank = 0
|
|
for entry in value:
|
|
if isinstance(entry, (str, int)):
|
|
# Easy case
|
|
rank += 1
|
|
self.adaptor.execute(
|
|
many_sql, (bioentry_id, term_id, str(entry), rank)
|
|
)
|
|
else:
|
|
pass
|
|
elif isinstance(value, (str, int)):
|
|
# Have a simple single entry, leave rank as the DB default
|
|
self.adaptor.execute(mono_sql, (bioentry_id, term_id, str(value)))
|
|
else:
|
|
pass
|
|
# print("Ignoring annotation '%s' entry of type '%s'" \
|
|
# % (key, type(value)))
|
|
|
|
    def _load_reference(self, reference, rank, bioentry_id):
        """Record SeqRecord's annotated references in the database (PRIVATE).

        Arguments:
         - reference - a Bio.SeqFeature.Reference object
         - rank - zero-based position of this reference in the record
         - bioentry_id - corresponding database identifier

        Reuses an existing reference row when one matches by MEDLINE id,
        PubMed id, or a CRC64 of authors/title/journal; otherwise inserts
        a new one. Finally links it to the bioentry with start/end/rank.
        """
        refs = None
        # First try to find an existing reference row by MEDLINE id...
        if reference.medline_id:
            refs = self.adaptor.execute_and_fetch_col0(
                "SELECT reference_id"
                " FROM reference JOIN dbxref USING (dbxref_id)"
                " WHERE dbname = 'MEDLINE' AND accession = %s",
                (reference.medline_id,),
            )
        # ...then by PubMed id...
        if not refs and reference.pubmed_id:
            refs = self.adaptor.execute_and_fetch_col0(
                "SELECT reference_id"
                " FROM reference JOIN dbxref USING (dbxref_id)"
                " WHERE dbname = 'PUBMED' AND accession = %s",
                (reference.pubmed_id,),
            )
        # ...and finally by a CRC64 checksum of the citation text.
        if not refs:
            s = []
            for f in reference.authors, reference.title, reference.journal:
                s.append(f or "<undef>")
            crc = crc64("".join(s))
            refs = self.adaptor.execute_and_fetch_col0(
                "SELECT reference_id FROM reference WHERE crc = %s", (crc,)
            )
        if not refs:
            # No existing row - insert one (note: crc was computed above,
            # since this branch is only reachable after the CRC lookup).
            if reference.medline_id:
                dbxref_id = self._add_dbxref("MEDLINE", reference.medline_id, 0)
            elif reference.pubmed_id:
                dbxref_id = self._add_dbxref("PUBMED", reference.pubmed_id, 0)
            else:
                dbxref_id = None
            authors = reference.authors or None
            title = reference.title or None
            # The location/journal field cannot be Null, so default
            # to an empty string rather than None:
            journal = reference.journal or ""
            self.adaptor.execute(
                "INSERT INTO reference (dbxref_id, location,"
                " title, authors, crc)"
                " VALUES (%s, %s, %s, %s, %s)",
                (dbxref_id, journal, title, authors, crc),
            )
            reference_id = self.adaptor.last_id("reference")
        else:
            reference_id = refs[0]

        # Convert the (0-based) reference location to 1-based coordinates.
        if reference.location:
            start = 1 + int(str(reference.location[0].start))
            end = int(str(reference.location[0].end))
        else:
            start = None
            end = None

        # Link the reference to this bioentry (rank stored 1-based).
        sql = (
            "INSERT INTO bioentry_reference (bioentry_id, reference_id,"
            ' start_pos, end_pos, "rank") VALUES (%s, %s, %s, %s, %s)'
        )
        self.adaptor.execute(sql, (bioentry_id, reference_id, start, end, rank + 1))
|
|
|
|
def _load_seqfeature(self, feature, feature_rank, bioentry_id):
|
|
"""Load a biopython SeqFeature into the database (PRIVATE)."""
|
|
# records loaded from a gff file using BCBio.GFF will contain value
|
|
# of 2nd column of the gff as a feature qualifier. The BioSQL wiki
|
|
# suggests that the source should not go in with the other feature
|
|
# mappings but instead be put in the term table
|
|
# (http://www.biosql.org/wiki/Annotation_Mapping)
|
|
try:
|
|
source = feature.qualifiers["source"]
|
|
if isinstance(source, list):
|
|
source = source[0]
|
|
seqfeature_id = self._load_seqfeature_basic(
|
|
feature.type, feature_rank, bioentry_id, source=source
|
|
)
|
|
except KeyError:
|
|
seqfeature_id = self._load_seqfeature_basic(
|
|
feature.type, feature_rank, bioentry_id
|
|
)
|
|
|
|
self._load_seqfeature_locations(feature, seqfeature_id)
|
|
self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
|
|
|
|
def _load_seqfeature_basic(
|
|
self, feature_type, feature_rank, bioentry_id, source="EMBL/GenBank/SwissProt"
|
|
):
|
|
"""Load the first tables of a seqfeature and returns the id (PRIVATE).
|
|
|
|
This loads the "key" of the seqfeature (ie. CDS, gene) and
|
|
the basic seqfeature table itself.
|
|
"""
|
|
ontology_id = self._get_ontology_id("SeqFeature Keys")
|
|
seqfeature_key_id = self._get_term_id(feature_type, ontology_id=ontology_id)
|
|
source_cat_id = self._get_ontology_id("SeqFeature Sources")
|
|
source_term_id = self._get_term_id(source, ontology_id=source_cat_id)
|
|
|
|
sql = (
|
|
"INSERT INTO seqfeature (bioentry_id, type_term_id, "
|
|
'source_term_id, "rank") VALUES (%s, %s, %s, %s)'
|
|
)
|
|
self.adaptor.execute(
|
|
sql, (bioentry_id, seqfeature_key_id, source_term_id, feature_rank + 1)
|
|
)
|
|
return self.adaptor.last_id("seqfeature")
|
|
|
|
def _load_seqfeature_locations(self, feature, seqfeature_id):
|
|
"""Load all of the locations for a SeqFeature into tables (PRIVATE).
|
|
|
|
This adds the locations related to the SeqFeature into the
|
|
seqfeature_location table. Fuzzies are not handled right now.
|
|
For a simple location, ie (1..2), we have a single table row
|
|
with seq_start = 1, seq_end = 2, location_rank = 1.
|
|
|
|
For split locations, ie (1..2, 3..4, 5..6) we would have three
|
|
row tables with::
|
|
|
|
start = 1, end = 2, rank = 1
|
|
start = 3, end = 4, rank = 2
|
|
start = 5, end = 6, rank = 3
|
|
|
|
"""
|
|
# TODO - Record an ontology for the locations (using location.term_id)
|
|
# which for now as in BioPerl we leave defaulting to NULL.
|
|
try:
|
|
if feature.location.operator != "join":
|
|
# e.g. order locations... we don't record "order" so it
|
|
# will become a "join" on reloading. What does BioPerl do?
|
|
import warnings
|
|
|
|
from Bio import BiopythonWarning
|
|
|
|
warnings.warn(
|
|
"%s location operators are not fully supported"
|
|
% feature.location_operator,
|
|
BiopythonWarning,
|
|
)
|
|
except AttributeError:
|
|
pass
|
|
# This will be a list of length one for a SimpleLocation:
|
|
parts = feature.location.parts
|
|
if parts and {loc.strand for loc in parts} == {-1}:
|
|
# To mimic prior behaviour of Biopython+BioSQL, reverse order
|
|
parts = parts[::-1]
|
|
# TODO - Check what BioPerl does; see also BioSeq.py code
|
|
for rank, loc in enumerate(parts):
|
|
self._insert_location(loc, rank + 1, seqfeature_id)
|
|
|
|
def _insert_location(self, location, rank, seqfeature_id):
|
|
"""Add SeqFeature location to seqfeature_location table (PRIVATE).
|
|
|
|
TODO - Add location operator to location_qualifier_value?
|
|
"""
|
|
# convert biopython locations to the 1-based location system
|
|
# used in bioSQL
|
|
# XXX This could also handle fuzzies
|
|
|
|
try:
|
|
start = int(location.start) + 1
|
|
except TypeError:
|
|
# Handle SwissProt unknown position (?)
|
|
if isinstance(location.start, UnknownPosition):
|
|
start = None
|
|
else:
|
|
raise
|
|
|
|
try:
|
|
end = int(location.end)
|
|
except TypeError:
|
|
# Handle SwissProt unknown position (?)
|
|
if isinstance(location.end, UnknownPosition):
|
|
end = None
|
|
else:
|
|
raise
|
|
|
|
# Biopython uses None when we don't know strand information but
|
|
# BioSQL requires something (non null) and sets this as zero
|
|
# So we'll use the strand or 0 if Biopython spits out None
|
|
strand = location.strand or 0
|
|
|
|
# TODO - Record an ontology term for the location (location.term_id)
|
|
# which for now like BioPerl we'll leave as NULL.
|
|
# This might allow us to record "between" positions properly, but I
|
|
# don't really see how it could work for before/after fuzzy positions
|
|
loc_term_id = None
|
|
|
|
if location.ref:
|
|
# sub_feature remote locations when they are in the same db as the
|
|
# current record do not have a value for ref_db, which SeqFeature
|
|
# object stores as None. BioSQL schema requires a varchar and is
|
|
# not NULL
|
|
dbxref_id = self._get_dbxref_id(location.ref_db or "", location.ref)
|
|
else:
|
|
dbxref_id = None
|
|
|
|
sql = (
|
|
"INSERT INTO location (seqfeature_id, dbxref_id, term_id,"
|
|
'start_pos, end_pos, strand, "rank") '
|
|
"VALUES (%s, %s, %s, %s, %s, %s, %s)"
|
|
)
|
|
self.adaptor.execute(
|
|
sql, (seqfeature_id, dbxref_id, loc_term_id, start, end, strand, rank)
|
|
)
|
|
|
|
"""
|
|
# See Bug 2677
|
|
# TODO - Record the location_operator (e.g. "join" or "order")
|
|
# using the location_qualifier_value table (which we and BioPerl
|
|
# have historically left empty).
|
|
# Note this will need an ontology term for the location qualifier
|
|
# (location_qualifier_value.term_id) for which oddly the schema
|
|
# does not allow NULL.
|
|
if feature.location_operator:
|
|
#e.g. "join" (common),
|
|
#or "order" (see Tests/GenBank/protein_refseq2.gb)
|
|
location_id = self.adaptor.last_id('location')
|
|
loc_qual_term_id = None # Not allowed in BioSQL v1.0.1
|
|
sql = ("INSERT INTO location_qualifier_value"
|
|
"(location_id, term_id, value) "
|
|
"VALUES (%s, %s, %s)")
|
|
self.adaptor.execute(sql, (location_id, loc_qual_term_id,
|
|
feature.location_operator))
|
|
"""
|
|
|
|
def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
|
|
"""Insert feature's (key, value) pair qualifiers (PRIVATE).
|
|
|
|
Qualifiers should be a dictionary of the form::
|
|
|
|
{key : [value1, value2]}
|
|
|
|
"""
|
|
tag_ontology_id = self._get_ontology_id("Annotation Tags")
|
|
for qualifier_key in qualifiers:
|
|
# Treat db_xref qualifiers differently to sequence annotation
|
|
# qualifiers by populating the seqfeature_dbxref and dbxref
|
|
# tables. Other qualifiers go into the seqfeature_qualifier_value
|
|
# and (if new) term tables.
|
|
if qualifier_key != "db_xref":
|
|
qualifier_key_id = self._get_term_id(
|
|
qualifier_key, ontology_id=tag_ontology_id
|
|
)
|
|
# now add all of the values to their table
|
|
entries = qualifiers[qualifier_key]
|
|
if not isinstance(entries, list):
|
|
# Could be a plain string, or an int or a float.
|
|
# However, we exect a list of strings here.
|
|
entries = [entries]
|
|
for qual_value_rank in range(len(entries)):
|
|
qualifier_value = entries[qual_value_rank]
|
|
sql = (
|
|
"INSERT INTO seqfeature_qualifier_value "
|
|
' (seqfeature_id, term_id, "rank", value) VALUES'
|
|
" (%s, %s, %s, %s)"
|
|
)
|
|
self.adaptor.execute(
|
|
sql,
|
|
(
|
|
seqfeature_id,
|
|
qualifier_key_id,
|
|
qual_value_rank + 1,
|
|
qualifier_value,
|
|
),
|
|
)
|
|
else:
|
|
# The dbxref_id qualifier/value sets go into the dbxref table
|
|
# as dbname, accession, version tuples, with dbxref.dbxref_id
|
|
# being automatically assigned, and into the seqfeature_dbxref
|
|
# table as seqfeature_id, dbxref_id, and rank tuples
|
|
self._load_seqfeature_dbxref(qualifiers[qualifier_key], seqfeature_id)
|
|
|
|
def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
|
|
"""Add SeqFeature's DB cross-references to the database (PRIVATE).
|
|
|
|
Arguments:
|
|
- dbxrefs - List, dbxref data from the source file in the
|
|
format <database>:<accession>
|
|
- seqfeature_id - Int, the identifier for the seqfeature in the
|
|
seqfeature table
|
|
|
|
Insert dbxref qualifier data for a seqfeature into the
|
|
seqfeature_dbxref and, if required, dbxref tables.
|
|
The dbxref_id qualifier/value sets go into the dbxref table
|
|
as dbname, accession, version tuples, with dbxref.dbxref_id
|
|
being automatically assigned, and into the seqfeature_dbxref
|
|
table as seqfeature_id, dbxref_id, and rank tuples.
|
|
"""
|
|
# NOTE - In older versions of Biopython, we would map the GenBank
|
|
# db_xref "name", for example "GI" to "GeneIndex", and give a warning
|
|
# for any unknown terms. This was a long term maintenance problem,
|
|
# and differed from BioPerl and BioJava's implementation. See bug 2405
|
|
for rank, value in enumerate(dbxrefs):
|
|
# Split the DB:accession format string at colons. We have to
|
|
# account for multiple-line and multiple-accession entries
|
|
try:
|
|
dbxref_data = value.replace(" ", "").replace("\n", "").split(":")
|
|
db = dbxref_data[0]
|
|
accessions = dbxref_data[1:]
|
|
except Exception:
|
|
raise ValueError(f"Parsing of db_xref failed: '{value}'") from None
|
|
# Loop over all the grabbed accessions, and attempt to fill the
|
|
# table
|
|
for accession in accessions:
|
|
# Get the dbxref_id value for the dbxref data
|
|
dbxref_id = self._get_dbxref_id(db, accession)
|
|
# Insert the seqfeature_dbxref data
|
|
self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank + 1)
|
|
|
|
def _get_dbxref_id(self, db, accession):
|
|
"""Get DB cross-reference for accession (PRIVATE).
|
|
|
|
Arguments:
|
|
- db - String, the name of the external database containing
|
|
the accession number
|
|
- accession - String, the accession of the dbxref data
|
|
|
|
Finds and returns the dbxref_id for the passed data. The method
|
|
attempts to find an existing record first, and inserts the data
|
|
if there is no record.
|
|
"""
|
|
# Check for an existing record
|
|
sql = "SELECT dbxref_id FROM dbxref WHERE dbname = %s AND accession = %s"
|
|
dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession))
|
|
# If there was a record, return the dbxref_id, else create the
|
|
# record and return the created dbxref_id
|
|
if dbxref_id:
|
|
return dbxref_id[0]
|
|
return self._add_dbxref(db, accession, 0)
|
|
|
|
def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
|
|
"""Get DB cross-reference, creating it if needed (PRIVATE).
|
|
|
|
Check for a pre-existing seqfeature_dbxref entry with the passed
|
|
seqfeature_id and dbxref_id. If one does not exist, insert new
|
|
data.
|
|
"""
|
|
# Check for an existing record
|
|
sql = (
|
|
"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref "
|
|
"WHERE seqfeature_id = %s AND dbxref_id = %s"
|
|
)
|
|
result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, dbxref_id))
|
|
# If there was a record, return without executing anything, else create
|
|
# the record and return
|
|
if result:
|
|
return result
|
|
return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
|
|
|
|
def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
|
|
"""Add DB cross-reference (PRIVATE).
|
|
|
|
Insert a seqfeature_dbxref row and return the seqfeature_id and
|
|
dbxref_id
|
|
"""
|
|
sql = (
|
|
"INSERT INTO seqfeature_dbxref "
|
|
'(seqfeature_id, dbxref_id, "rank") VALUES'
|
|
"(%s, %s, %s)"
|
|
)
|
|
self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank))
|
|
return (seqfeature_id, dbxref_id)
|
|
|
|
def _load_dbxrefs(self, record, bioentry_id):
|
|
"""Load any sequence level cross references into the database (PRIVATE).
|
|
|
|
See table bioentry_dbxref.
|
|
"""
|
|
for rank, value in enumerate(record.dbxrefs):
|
|
# Split the DB:accession string at first colon.
|
|
# We have to cope with things like:
|
|
# "MGD:MGI:892" (db="MGD", accession="MGI:892")
|
|
# "GO:GO:123" (db="GO", accession="GO:123")
|
|
#
|
|
# Annoyingly I have seen the NCBI use both the style
|
|
# "GO:GO:123" and "GO:123" in different vintages.
|
|
newline_escape_count = value.count("\n")
|
|
if newline_escape_count != 0:
|
|
raise ValueError(
|
|
"Expected a single line in value, got {newline_escape_count}"
|
|
)
|
|
try:
|
|
db, accession = value.split(":", 1)
|
|
db = db.strip()
|
|
accession = accession.strip()
|
|
except Exception:
|
|
raise ValueError(f"Parsing of dbxrefs list failed: '{value}'") from None
|
|
# Get the dbxref_id value for the dbxref data
|
|
dbxref_id = self._get_dbxref_id(db, accession)
|
|
# Insert the bioentry_dbxref data
|
|
self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank + 1)
|
|
|
|
def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
|
|
"""Get pre-existing db-xref, or create and return it (PRIVATE).
|
|
|
|
Check for a pre-existing bioentry_dbxref entry with the passed
|
|
seqfeature_id and dbxref_id. If one does not exist, insert new
|
|
data
|
|
"""
|
|
# Check for an existing record
|
|
sql = (
|
|
"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref "
|
|
"WHERE bioentry_id = %s AND dbxref_id = %s"
|
|
)
|
|
result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id, dbxref_id))
|
|
# If there was a record, return without executing anything, else create
|
|
# the record and return
|
|
if result:
|
|
return result
|
|
return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
|
|
|
|
def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
|
|
"""Insert a bioentry_dbxref row (PRIVATE).
|
|
|
|
Returns the seqfeature_id and dbxref_id (PRIVATE).
|
|
"""
|
|
sql = (
|
|
"INSERT INTO bioentry_dbxref "
|
|
'(bioentry_id,dbxref_id,"rank") VALUES '
|
|
"(%s, %s, %s)"
|
|
)
|
|
self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank))
|
|
return (bioentry_id, dbxref_id)
|
|
|
|
|
|
class DatabaseRemover:
    """Complement the Loader functionality by fully removing a database.

    This probably isn't really useful for normal purposes, since you
    can just do a::

        DROP DATABASE db_name

    and then recreate the database. But, it's really useful for testing
    purposes.
    """

    def __init__(self, adaptor, dbid):
        """Initialize with a database id and adaptor connection."""
        self.adaptor = adaptor
        self.dbid = dbid

    def remove(self):
        """Remove everything related to the given database id."""
        # Delete the bioentry rows first (child data cascades via the
        # schema), then the biodatabase row itself.
        for table in ("bioentry", "biodatabase"):
            self.adaptor.execute(
                f"DELETE FROM {table} WHERE biodatabase_id = %s", (self.dbid,)
            )