mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 21:53:47 +08:00
$ ruff check --fix --select=I \ --config=lint.isort.force-single-line=true \ --config=lint.isort.order-by-type=false \ BioSQL/ Bio/ Tests/ Scripts/ Doc/ setup.py Using ruff version 0.4.10
850 lines
32 KiB
Python
850 lines
32 KiB
Python
# Copyright 2007 by Michiel de Hoon. All rights reserved.
|
|
# This code is part of the Biopython distribution and governed by its
|
|
# license. Please see the LICENSE file that should have been included
|
|
# as part of this package.
|
|
"""Code to work with the sprotXX.dat file from SwissProt.
|
|
|
|
https://web.expasy.org/docs/userman.html
|
|
|
|
Classes:
|
|
- Record Holds SwissProt data.
|
|
- Reference Holds reference data from a SwissProt record.
|
|
|
|
Functions:
|
|
- read Read one SwissProt record
|
|
- parse Read multiple SwissProt records
|
|
|
|
"""
|
|
|
|
import io
|
|
import re
|
|
|
|
from Bio.SeqFeature import Position
|
|
from Bio.SeqFeature import SeqFeature
|
|
from Bio.SeqFeature import SimpleLocation
|
|
|
|
|
|
class SwissProtParserError(ValueError):
|
|
"""An error occurred while parsing a SwissProt file."""
|
|
|
|
def __init__(self, *args, line=None):
|
|
"""Create a SwissProtParserError object with the offending line."""
|
|
super().__init__(*args)
|
|
self.line = line
|
|
|
|
|
|
class Record:
|
|
"""Holds information from a SwissProt record.
|
|
|
|
Attributes:
|
|
- entry_name Name of this entry, e.g. RL1_ECOLI.
|
|
- data_class Either 'STANDARD' or 'PRELIMINARY'.
|
|
- molecule_type Type of molecule, 'PRT',
|
|
- sequence_length Number of residues.
|
|
- accessions List of the accession numbers, e.g. ['P00321']
|
|
- created A tuple of (date, release).
|
|
- sequence_update A tuple of (date, release).
|
|
- annotation_update A tuple of (date, release).
|
|
- description Free-format description.
|
|
- gene_name A list of dictionaries with keys 'Name', 'Synonyms',
|
|
'OrderedLocusNames' and 'ORFNames'.
|
|
- organism The source of the sequence.
|
|
- organelle The origin of the sequence.
|
|
- organism_classification The taxonomy classification. List of strings.
|
|
(http://www.ncbi.nlm.nih.gov/Taxonomy/)
|
|
- taxonomy_id A list of NCBI taxonomy id's.
|
|
- host_organism A list of names of the hosts of a virus, if any.
|
|
- host_taxonomy_id A list of NCBI taxonomy id's of the hosts, if any.
|
|
- references List of Reference objects.
|
|
- comments List of strings.
|
|
- cross_references List of tuples (db, id1[, id2][, id3]). See the docs.
|
|
- keywords List of the keywords.
|
|
- features List of tuples (key name, from, to, description).
|
|
from and to can be either integers for the residue
|
|
numbers, '<', '>', or '?'
|
|
- protein_existence Numerical value describing the evidence for the existence of the protein.
|
|
- seqinfo tuple of (length, molecular weight, CRC32 value)
|
|
- sequence The sequence.
|
|
|
|
Examples
|
|
--------
|
|
>>> from Bio import SwissProt
|
|
>>> example_filename = "SwissProt/P68308.txt"
|
|
>>> with open(example_filename) as handle:
|
|
... records = SwissProt.parse(handle)
|
|
... for record in records:
|
|
... print(record.entry_name)
|
|
... print(record.accessions)
|
|
... print(record.keywords)
|
|
... print(record.organism)
|
|
... print(record.sequence[:20] + "...")
|
|
...
|
|
NU3M_BALPH
|
|
['P68308', 'P24973']
|
|
['Electron transport', 'Membrane', 'Mitochondrion', 'Mitochondrion inner membrane', 'NAD', 'Respiratory chain', 'Translocase', 'Transmembrane', 'Transmembrane helix', 'Transport', 'Ubiquinone']
|
|
Balaenoptera physalus (Fin whale) (Balaena physalus).
|
|
MNLLLTLLTNTTLALLLVFI...
|
|
|
|
"""
|
|
|
|
def __init__(self):
|
|
"""Initialize the class."""
|
|
self.entry_name = None
|
|
self.data_class = None
|
|
self.molecule_type = None
|
|
self.sequence_length = None
|
|
|
|
self.accessions = []
|
|
self.created = None
|
|
self.sequence_update = None
|
|
self.annotation_update = None
|
|
|
|
self.description = []
|
|
self.gene_name = []
|
|
self.organism = []
|
|
self.organelle = ""
|
|
self.organism_classification = []
|
|
self.taxonomy_id = []
|
|
self.host_organism = []
|
|
self.host_taxonomy_id = []
|
|
self.references = []
|
|
self.comments = []
|
|
self.cross_references = []
|
|
self.keywords = []
|
|
self.features = []
|
|
self.protein_existence = ""
|
|
|
|
self.seqinfo = None
|
|
self.sequence = ""
|
|
|
|
|
|
class Reference:
|
|
"""Holds information from one reference in a SwissProt entry.
|
|
|
|
Attributes:
|
|
- number Number of reference in an entry.
|
|
- evidence Evidence code. List of strings.
|
|
- positions Describes extent of work. List of strings.
|
|
- comments Comments. List of (token, text).
|
|
- references References. List of (dbname, identifier).
|
|
- authors The authors of the work.
|
|
- title Title of the work.
|
|
- location A citation for the work.
|
|
|
|
"""
|
|
|
|
def __init__(self):
|
|
"""Initialize the class."""
|
|
self.number = None
|
|
self.positions = []
|
|
self.comments = []
|
|
self.references = []
|
|
self.authors = []
|
|
self.title = []
|
|
self.location = []
|
|
|
|
|
|
class FeatureTable(SeqFeature):
|
|
"""Stores feature annotations for specific regions of the sequence.
|
|
|
|
This is a subclass of SeqFeature, defined in Bio.SeqFeature, where the
|
|
attributes are used as follows:
|
|
|
|
- ``location``: location of the feature on the canonical or isoform
|
|
sequence; the location is stored as an instance of SimpleLocation,
|
|
defined in Bio.SeqFeature, with the ref attribute set to the isoform
|
|
ID referring to the canonical or isoform sequence on which the feature
|
|
is defined
|
|
- ``id``: unique and stable identifier (FTId), only provided for features
|
|
belonging to the types CARBOHYD, CHAIN, PEPTIDE, PROPEP, VARIANT, or
|
|
VAR_SEQ
|
|
- ``type``: indicates the type of feature, as defined by the UniProt
|
|
Knowledgebase documentation:
|
|
|
|
- ACT_SITE: amino acid(s) involved in the activity of an enzyme
|
|
- BINDING: binding site for any chemical group
|
|
- CARBOHYD: glycosylation site; an FTId identifier to the GlyConnect
|
|
database is provided if annotated there
|
|
- CA_BIND: calcium-binding region
|
|
- CHAIN: polypeptide chain in the mature protein
|
|
- COILED: coiled-coil region
|
|
- COMPBIAS: compositionally biased region
|
|
- CONFLICT: different sources report differing sequences
|
|
- CROSSLNK: posttransationally formed amino acid bond
|
|
- DISULFID: disulfide bond
|
|
- DNA_BIND: DNA-binding region
|
|
- DOMAIN: domain, defined as a specific combination of secondary
|
|
structures organized into a characteristic three-dimensional
|
|
structure or fold
|
|
- INIT_MET: initiator methionine
|
|
- INTRAMEM: region located in a membrane without crossing it
|
|
- HELIX: alpha-, 3(10)-, or pi-helix secondary structure
|
|
- LIPID: covalent binding of a lipid moiety
|
|
- METAL: binding site for a metal ion
|
|
- MOD_RES: posttranslational modification (PTM) of a residue,
|
|
annotated by the controlled vocabulary defined by the ptmlist.txt
|
|
document on the UniProt website
|
|
- MOTIF: short sequence motif of biological interest
|
|
- MUTAGEN: site experimentally altered by mutagenesis
|
|
- NON_CONS: non-consecutive residues
|
|
- NON_STD: non-standard amino acid
|
|
- NON_TER: the residue at an extremity of the sequence is not the
|
|
terminal residue
|
|
- NP_BIND: nucleotide phosphate-binding region
|
|
- PEPTIDE: released active mature polypeptide
|
|
- PROPEP: any processed propeptide
|
|
- REGION: region of interest in the sequence
|
|
- REPEAT: internal sequence repetition
|
|
- SIGNAL: signal sequence (prepeptide)
|
|
- SITE: amino-acid site of interest not represented by another
|
|
feature key
|
|
- STRAND: beta-strand secondary structure; either a hydrogen-bonded
|
|
extended beta strand or a residue in an isolated beta-bridge
|
|
- TOPO_DOM: topological domain
|
|
- TRANSIT: transit peptide (mitochondrion, chloroplast, thylakoid,
|
|
cyanelle, peroxisome, etc.)
|
|
- TRANSMEM: transmembrane region
|
|
- TURN: H-bonded turn (3-, 4-, or 5-turn)
|
|
- UNSURE: uncertainties in the sequence
|
|
- VARIANT: sequence variant; an FTId is provided for protein sequence
|
|
variants of Hominidae (great apes and humans)
|
|
- VAR_SEQ: sequence variant produced by alternative splicing,
|
|
alternative promoter usage, alternative initiation, or ribosomal
|
|
frameshifting
|
|
- ZN_FING: zinc finger region
|
|
|
|
- ``qualifiers``: a dictionary of additional information, which may include
|
|
the feature evidence and free-text notes. While SwissProt includes the
|
|
feature identifier code (FTId) as a qualifier, it is stored as the
|
|
attribute ID of the FeatureTable object.
|
|
|
|
"""
|
|
|
|
|
|
def parse(source):
|
|
"""Read multiple SwissProt records from file.
|
|
|
|
Argument source is a file-like object or a path to a file.
|
|
|
|
Returns a generator object which yields Bio.SwissProt.Record() objects.
|
|
"""
|
|
handle = _open(source)
|
|
try:
|
|
while True:
|
|
record = _read(handle)
|
|
if not record:
|
|
return
|
|
yield record
|
|
finally:
|
|
if handle is not source:
|
|
handle.close()
|
|
|
|
|
|
def read(source):
|
|
"""Read one SwissProt record from file.
|
|
|
|
Argument source is a file-like object or a path to a file.
|
|
|
|
Returns a Record() object.
|
|
"""
|
|
handle = _open(source)
|
|
try:
|
|
record = _read(handle)
|
|
if not record:
|
|
raise ValueError("No SwissProt record found")
|
|
# We should have reached the end of the record by now.
|
|
# Try to read one more line to be sure:
|
|
try:
|
|
next(handle)
|
|
except StopIteration:
|
|
return record
|
|
raise ValueError("More than one SwissProt record found")
|
|
finally:
|
|
if handle is not source:
|
|
handle.close()
|
|
|
|
|
|
# Everything below is considered private
|
|
|
|
|
|
def _open(source):
|
|
try:
|
|
handle = open(source)
|
|
return handle
|
|
except TypeError:
|
|
handle = source
|
|
if handle.read(0) == "":
|
|
# handle is text; assume the encoding is compatible with ASCII
|
|
return handle
|
|
# handle is binary; SwissProt encoding is always ASCII
|
|
return io.TextIOWrapper(handle, encoding="ASCII")
|
|
|
|
|
|
def _read(handle):
|
|
record = None
|
|
unread = ""
|
|
try:
|
|
line = next(handle)
|
|
except StopIteration:
|
|
return record
|
|
key, value = line[:2], line[5:].rstrip()
|
|
if key != "ID":
|
|
raise SwissProtParserError("Failed to find ID in first line", line=line)
|
|
record = Record()
|
|
_read_id(record, line)
|
|
_sequence_lines = []
|
|
for line in handle:
|
|
key, value = line[:2], line[5:].rstrip()
|
|
if unread:
|
|
value = unread + " " + value
|
|
unread = ""
|
|
if key == "AC":
|
|
accessions = value.rstrip(";").split("; ")
|
|
record.accessions.extend(accessions)
|
|
elif key == "DT":
|
|
_read_dt(record, line)
|
|
elif key == "DE":
|
|
record.description.append(value.strip())
|
|
elif key == "GN":
|
|
if value == "and":
|
|
record.gene_name.append("")
|
|
else:
|
|
if len(record.gene_name) == 0:
|
|
record.gene_name.append("")
|
|
record.gene_name[-1] += value + " "
|
|
elif key == "OS":
|
|
record.organism.append(value)
|
|
elif key == "OG":
|
|
record.organelle += line[5:]
|
|
elif key == "OC":
|
|
cols = value.rstrip(";.").split("; ")
|
|
record.organism_classification.extend(cols)
|
|
elif key == "OX":
|
|
_read_ox(record, line)
|
|
elif key == "OH":
|
|
_read_oh(record, line)
|
|
elif key == "RN":
|
|
reference = Reference()
|
|
_read_rn(reference, value)
|
|
record.references.append(reference)
|
|
elif key == "RP":
|
|
assert record.references, "RP: missing RN"
|
|
record.references[-1].positions.append(value)
|
|
elif key == "RC":
|
|
assert record.references, "RC: missing RN"
|
|
reference = record.references[-1]
|
|
unread = _read_rc(reference, value)
|
|
elif key == "RX":
|
|
assert record.references, "RX: missing RN"
|
|
reference = record.references[-1]
|
|
_read_rx(reference, value)
|
|
elif key == "RL":
|
|
assert record.references, "RL: missing RN"
|
|
reference = record.references[-1]
|
|
reference.location.append(value)
|
|
# In UniProt release 1.12 of 6/21/04, there is a new RG
|
|
# (Reference Group) line, which references a group instead of
|
|
# an author. Each block must have at least 1 RA or RG line.
|
|
elif key == "RA":
|
|
assert record.references, "RA: missing RN"
|
|
reference = record.references[-1]
|
|
reference.authors.append(value)
|
|
elif key == "RG":
|
|
assert record.references, "RG: missing RN"
|
|
reference = record.references[-1]
|
|
reference.authors.append(value)
|
|
elif key == "RT":
|
|
assert record.references, "RT: missing RN"
|
|
reference = record.references[-1]
|
|
reference.title.append(value)
|
|
elif key == "CC":
|
|
_read_cc(record, line)
|
|
elif key == "DR":
|
|
_read_dr(record, value)
|
|
elif key == "PE":
|
|
_read_pe(record, value)
|
|
elif key == "KW":
|
|
_read_kw(record, value)
|
|
elif key == "FT":
|
|
_read_ft(record, line)
|
|
elif key == "SQ":
|
|
cols = value.split()
|
|
assert len(cols) == 7, f"I don't understand SQ line {line}"
|
|
# Do more checking here?
|
|
record.seqinfo = int(cols[1]), int(cols[3]), cols[5]
|
|
elif key == " ":
|
|
_sequence_lines.append(value.replace(" ", "").rstrip())
|
|
elif key == "//":
|
|
_read_gn(record)
|
|
# Join multiline data into one string
|
|
record.description = " ".join(record.description)
|
|
record.organism = " ".join(record.organism)
|
|
record.organelle = record.organelle.rstrip()
|
|
for reference in record.references:
|
|
reference.authors = " ".join(reference.authors).rstrip(";")
|
|
if reference.title:
|
|
title = reference.title[0]
|
|
for fragment in reference.title[1:]:
|
|
if not title.endswith("-"):
|
|
title += " "
|
|
title += fragment
|
|
title = title.rstrip(";")
|
|
if title.startswith('"') and title.endswith('"'):
|
|
title = title[1:-1] # remove quotes
|
|
else:
|
|
title = ""
|
|
reference.title = title
|
|
reference.location = " ".join(reference.location)
|
|
record.sequence = "".join(_sequence_lines)
|
|
return record
|
|
elif key == "**":
|
|
# Do this one last, as it will almost never occur.
|
|
# See Bug 2353, some files from the EBI have extra lines
|
|
# starting "**" (two asterisks/stars). They appear
|
|
# to be unofficial automated annotations. e.g.
|
|
# **
|
|
# ** ################# INTERNAL SECTION ##################
|
|
# **HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003.
|
|
pass
|
|
else:
|
|
raise SwissProtParserError(f"Unknown keyword {key!r} found", line=line)
|
|
if record:
|
|
raise ValueError("Unexpected end of stream.")
|
|
|
|
|
|
def _read_gn(record):
|
|
for i, text in enumerate(record.gene_name):
|
|
tokens = text.rstrip("; ").split("; ")
|
|
gene_name = {}
|
|
for token in tokens:
|
|
# value may include an equals sign, e.g.
|
|
# GN Name=Lacc1=POX4 {ECO:0000313|EMBL:KDQ27217.1};
|
|
key, value = token.strip().split("=", 1)
|
|
if key == "Name":
|
|
gene_name["Name"] = value
|
|
else:
|
|
assert key in ("Synonyms", "OrderedLocusNames", "ORFNames")
|
|
gene_name[key] = value.split(", ")
|
|
record.gene_name[i] = gene_name
|
|
|
|
|
|
def _read_id(record, line):
|
|
cols = line[5:].split()
|
|
# Prior to release 51, included with MoleculeType:
|
|
# ID EntryName DataClass; MoleculeType; SequenceLength AA.
|
|
#
|
|
# Newer files lack the MoleculeType:
|
|
# ID EntryName DataClass; SequenceLength AA.
|
|
if len(cols) == 5:
|
|
record.entry_name = cols[0]
|
|
record.data_class = cols[1].rstrip(";")
|
|
record.molecule_type = cols[2].rstrip(";")
|
|
record.sequence_length = int(cols[3])
|
|
elif len(cols) == 4:
|
|
record.entry_name = cols[0]
|
|
record.data_class = cols[1].rstrip(";")
|
|
record.molecule_type = None
|
|
record.sequence_length = int(cols[2])
|
|
else:
|
|
raise SwissProtParserError("ID line has unrecognised format", line=line)
|
|
# check if the data class is one of the allowed values
|
|
allowed = ("STANDARD", "PRELIMINARY", "IPI", "Reviewed", "Unreviewed")
|
|
if record.data_class not in allowed:
|
|
message = f"Unrecognized data class {record.data_class!r}"
|
|
raise SwissProtParserError(message, line=line)
|
|
|
|
# molecule_type should be 'PRT' for PRoTein
|
|
# Note that has been removed in recent releases (set to None)
|
|
if record.molecule_type not in (None, "PRT"):
|
|
message = f"Unrecognized molecule type {record.molecule_type!r}"
|
|
raise SwissProtParserError(message, line=line)
|
|
|
|
|
|
def _read_dt(record, line):
|
|
value = line[5:]
|
|
uprline = value.upper()
|
|
cols = value.rstrip().split()
|
|
if (
|
|
"CREATED" in uprline
|
|
or "LAST SEQUENCE UPDATE" in uprline
|
|
or "LAST ANNOTATION UPDATE" in uprline
|
|
):
|
|
# Old style DT line
|
|
# =================
|
|
# e.g.
|
|
# DT 01-FEB-1995 (Rel. 31, Created)
|
|
# DT 01-FEB-1995 (Rel. 31, Last sequence update)
|
|
# DT 01-OCT-2000 (Rel. 40, Last annotation update)
|
|
#
|
|
# or:
|
|
# DT 08-JAN-2002 (IPI Human rel. 2.3, Created)
|
|
# ...
|
|
|
|
# find where the version information will be located
|
|
# This is needed for when you have cases like IPI where
|
|
# the release version is in a different spot:
|
|
# DT 08-JAN-2002 (IPI Human rel. 2.3, Created)
|
|
uprcols = uprline.split()
|
|
rel_index = -1
|
|
for index in range(len(uprcols)):
|
|
if "REL." in uprcols[index]:
|
|
rel_index = index
|
|
assert rel_index >= 0, f"Could not find Rel. in DT line: {line}"
|
|
version_index = rel_index + 1
|
|
# get the version information
|
|
str_version = cols[version_index].rstrip(",")
|
|
# no version number
|
|
if str_version == "":
|
|
version = 0
|
|
# dot versioned
|
|
elif "." in str_version:
|
|
version = str_version
|
|
# integer versioned
|
|
else:
|
|
version = int(str_version)
|
|
date = cols[0]
|
|
|
|
if "CREATED" in uprline:
|
|
record.created = date, version
|
|
elif "LAST SEQUENCE UPDATE" in uprline:
|
|
record.sequence_update = date, version
|
|
elif "LAST ANNOTATION UPDATE" in uprline:
|
|
record.annotation_update = date, version
|
|
else:
|
|
raise SwissProtParserError("Unrecognised DT (DaTe) line", line=line)
|
|
elif (
|
|
"INTEGRATED INTO" in uprline
|
|
or "SEQUENCE VERSION" in uprline
|
|
or "ENTRY VERSION" in uprline
|
|
):
|
|
# New style DT line
|
|
# =================
|
|
# As of UniProt Knowledgebase release 7.0 (including
|
|
# Swiss-Prot release 49.0 and TrEMBL release 32.0) the
|
|
# format of the DT lines and the version information
|
|
# in them was changed - the release number was dropped.
|
|
#
|
|
# For more information see bug 1948 and
|
|
# http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0
|
|
#
|
|
# e.g.
|
|
# DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot.
|
|
# DT 15-OCT-2001, sequence version 3.
|
|
# DT 01-APR-2004, entry version 14.
|
|
#
|
|
# This is a new style DT line...
|
|
|
|
# The date should be in string cols[1]
|
|
# Get the version number if there is one.
|
|
# For the three DT lines above: 0, 3, 14
|
|
try:
|
|
version = 0
|
|
for s in cols[-1].split("."):
|
|
if s.isdigit():
|
|
version = int(s)
|
|
except ValueError:
|
|
version = 0
|
|
date = cols[0].rstrip(",")
|
|
|
|
# Reuse the historical property names, even though
|
|
# the meaning has changed slightly:
|
|
if "INTEGRATED" in uprline:
|
|
record.created = date, version
|
|
elif "SEQUENCE VERSION" in uprline:
|
|
record.sequence_update = date, version
|
|
elif "ENTRY VERSION" in uprline:
|
|
record.annotation_update = date, version
|
|
else:
|
|
raise SwissProtParserError("Unrecognised DT (DaTe) line", line=line)
|
|
else:
|
|
raise SwissProtParserError("Failed to parse DT (DaTe) line", line=line)
|
|
|
|
|
|
def _read_ox(record, line):
|
|
# The OX line used to be in the simple format:
|
|
# OX DESCRIPTION=ID[, ID]...;
|
|
# If there are too many id's to fit onto a line, then the ID's
|
|
# continue directly onto the next line, e.g.
|
|
# OX DESCRIPTION=ID[, ID]...
|
|
# OX ID[, ID]...;
|
|
# Currently, the description is always "NCBI_TaxID".
|
|
# To parse this, I need to check to see whether I'm at the
|
|
# first line. If I am, grab the description and make sure
|
|
# it's an NCBI ID. Then, grab all the id's.
|
|
#
|
|
# As of the 2014-10-01 release, there may be an evidence code, e.g.
|
|
# OX NCBI_TaxID=418404 {ECO:0000313|EMBL:AEX14553.1};
|
|
# In the short term, we will ignore any evidence codes:
|
|
line = line.split("{")[0]
|
|
if record.taxonomy_id:
|
|
ids = line[5:].rstrip().rstrip(";")
|
|
else:
|
|
descr, ids = line[5:].rstrip().rstrip(";").split("=")
|
|
assert descr == "NCBI_TaxID", f"Unexpected taxonomy type {descr}"
|
|
record.taxonomy_id.extend(ids.split(", "))
|
|
|
|
|
|
def _read_oh(record, line):
|
|
# Line type OH (Organism Host) for viral hosts
|
|
assert line[5:].startswith("NCBI_TaxID="), f"Unexpected {line}"
|
|
line = line[16:].rstrip()
|
|
assert line[-1] == "." and line.count(";") == 1, line
|
|
taxid, name = line[:-1].split(";")
|
|
record.host_taxonomy_id.append(taxid.strip())
|
|
record.host_organism.append(name.strip())
|
|
|
|
|
|
def _read_rn(reference, rn):
|
|
# This used to be a very simple line with a reference number, e.g.
|
|
# RN [1]
|
|
# As of the 2014-10-01 release, there may be an evidence code, e.g.
|
|
# RN [1] {ECO:0000313|EMBL:AEX14553.1}
|
|
words = rn.split(None, 1)
|
|
number = words[0]
|
|
assert number.startswith("[") and number.endswith("]"), f"Missing brackets {number}"
|
|
reference.number = int(number[1:-1])
|
|
if len(words) > 1:
|
|
evidence = words[1]
|
|
assert evidence.startswith("{") and evidence.endswith(
|
|
"}"
|
|
), f"Missing braces {evidence}"
|
|
reference.evidence = evidence[1:-1].split("|")
|
|
|
|
|
|
def _read_rc(reference, value):
|
|
cols = value.split(";")
|
|
if value[-1] == ";":
|
|
unread = ""
|
|
else:
|
|
cols, unread = cols[:-1], cols[-1]
|
|
for col in cols:
|
|
if not col: # last column will be the empty string
|
|
return
|
|
# The token is everything before the first '=' character.
|
|
i = col.find("=")
|
|
if i >= 0:
|
|
token, text = col[:i], col[i + 1 :]
|
|
comment = token.lstrip(), text
|
|
reference.comments.append(comment)
|
|
else:
|
|
comment = reference.comments[-1]
|
|
comment = f"{comment} {col}"
|
|
reference.comments[-1] = comment
|
|
return unread
|
|
|
|
|
|
def _read_rx(reference, value):
|
|
# The basic (older?) RX line is of the form:
|
|
# RX MEDLINE; 85132727.
|
|
# but there are variants of this that need to be dealt with (see below)
|
|
|
|
# CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33
|
|
# have extraneous information in the RX line. Check for
|
|
# this and chop it out of the line.
|
|
# (noticed by katel@worldpath.net)
|
|
value = value.replace(" [NCBI, ExPASy, Israel, Japan]", "")
|
|
|
|
# RX lines can also be used of the form
|
|
# RX PubMed=9603189;
|
|
# reported by edvard@farmasi.uit.no
|
|
# and these can be more complicated like:
|
|
# RX MEDLINE=95385798; PubMed=7656980;
|
|
# RX PubMed=15060122; DOI=10.1136/jmg 2003.012781;
|
|
# We look for these cases first and deal with them
|
|
warn = False
|
|
if "=" in value:
|
|
cols = value.split("; ")
|
|
cols = [x.strip() for x in cols]
|
|
cols = [x for x in cols if x]
|
|
for col in cols:
|
|
x = col.split("=")
|
|
if len(x) != 2 or x == ("DOI", "DOI"):
|
|
warn = True
|
|
break
|
|
assert len(x) == 2, f"I don't understand RX line {value}"
|
|
reference.references.append((x[0], x[1].rstrip(";")))
|
|
# otherwise we assume we have the type 'RX MEDLINE; 85132727.'
|
|
else:
|
|
cols = value.split("; ")
|
|
# normally we split into the three parts
|
|
if len(cols) != 2:
|
|
warn = True
|
|
else:
|
|
reference.references.append((cols[0].rstrip(";"), cols[1].rstrip(".")))
|
|
if warn:
|
|
import warnings
|
|
|
|
from Bio import BiopythonParserWarning
|
|
|
|
warnings.warn(f"Possibly corrupt RX line {value!r}", BiopythonParserWarning)
|
|
|
|
|
|
def _read_cc(record, line):
|
|
key, value = line[5:8], line[9:].rstrip()
|
|
if key == "-!-": # Make a new comment
|
|
record.comments.append(value)
|
|
elif key == " ": # add to the previous comment
|
|
if not record.comments:
|
|
# TCMO_STRGA in Release 37 has comment with no topic
|
|
record.comments.append(value)
|
|
else:
|
|
record.comments[-1] += " " + value
|
|
|
|
|
|
def _read_dr(record, value):
|
|
cols = value.rstrip(".").split("; ")
|
|
record.cross_references.append(tuple(cols))
|
|
|
|
|
|
def _read_pe(record, value):
|
|
pe = value.split(":")
|
|
record.protein_existence = int(pe[0])
|
|
|
|
|
|
def _read_kw(record, value):
|
|
# Old style - semi-colon separated, multi-line. e.g. Q13639.txt
|
|
# KW Alternative splicing; Cell membrane; Complete proteome;
|
|
# KW Disulfide bond; Endosome; G-protein coupled receptor; Glycoprotein;
|
|
# KW Lipoprotein; Membrane; Palmitate; Polymorphism; Receptor; Transducer;
|
|
# KW Transmembrane.
|
|
#
|
|
# New style as of 2014-10-01 release with evidence codes, e.g. H2CNN8.txt
|
|
# KW Monooxygenase {ECO:0000313|EMBL:AEX14553.1};
|
|
# KW Oxidoreductase {ECO:0000313|EMBL:AEX14553.1}.
|
|
# For now to match the XML parser, drop the evidence codes.
|
|
for val in value.rstrip(";.").split("; "):
|
|
if val.endswith("}"):
|
|
# Discard the evidence code
|
|
val = val.rsplit("{", 1)[0]
|
|
record.keywords.append(val.strip())
|
|
|
|
|
|
def _read_ft(record, line):
|
|
name = line[5:13].rstrip()
|
|
if name:
|
|
if line[13:21] == " ": # new-style FT line
|
|
location = line[21:80].rstrip()
|
|
try:
|
|
isoform_id, location = location.split(":")
|
|
except ValueError:
|
|
isoform_id = None
|
|
try:
|
|
from_res, to_res = location.split("..")
|
|
except ValueError:
|
|
from_res = location
|
|
to_res = ""
|
|
qualifiers = {}
|
|
else: # old-style FT line
|
|
from_res = line[14:20].lstrip()
|
|
to_res = line[21:27].lstrip()
|
|
isoform_id = None
|
|
description = line[34:75].rstrip()
|
|
qualifiers = {"description": description}
|
|
from_res = Position.fromstring(from_res, -1)
|
|
if to_res == "":
|
|
to_res = from_res + 1
|
|
else:
|
|
to_res = Position.fromstring(to_res)
|
|
location = SimpleLocation(from_res, to_res, ref=isoform_id)
|
|
feature = FeatureTable(
|
|
location=location, type=name, id=None, qualifiers=qualifiers
|
|
)
|
|
record.features.append(feature)
|
|
return
|
|
# this line is a continuation of the previous feature
|
|
feature = record.features[-1]
|
|
if line[5:34] == " ": # old-style FT line
|
|
description = line[34:75].rstrip()
|
|
if description.startswith("/FTId="):
|
|
# store the FTId as the feature ID
|
|
feature.id = description[6:].rstrip(".")
|
|
return
|
|
# this line is a continuation of the description of the previous feature
|
|
old_description = feature.qualifiers["description"]
|
|
if old_description.endswith("-"):
|
|
description = f"{old_description}{description}"
|
|
else:
|
|
description = f"{old_description} {description}"
|
|
|
|
if feature.type in ("VARSPLIC", "VAR_SEQ"): # special case
|
|
# Remove unwanted spaces in sequences.
|
|
# During line carryover, the sequences in VARSPLIC/VAR_SEQ can get
|
|
# mangled with unwanted spaces like:
|
|
# 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH'
|
|
# We want to check for this case and correct it as it happens.
|
|
try:
|
|
first_seq, second_seq = description.split(" -> ")
|
|
except ValueError:
|
|
pass
|
|
else:
|
|
extra_info = ""
|
|
# we might have more information at the end of the
|
|
# second sequence, which should be in parenthesis
|
|
extra_info_pos = second_seq.find(" (")
|
|
if extra_info_pos != -1:
|
|
extra_info = second_seq[extra_info_pos:]
|
|
second_seq = second_seq[:extra_info_pos]
|
|
# now clean spaces out of the first and second string
|
|
first_seq = first_seq.replace(" ", "")
|
|
second_seq = second_seq.replace(" ", "")
|
|
# reassemble the description
|
|
description = first_seq + " -> " + second_seq + extra_info
|
|
feature.qualifiers["description"] = description
|
|
else: # new-style FT line
|
|
value = line[21:].rstrip()
|
|
match = re.match(r"^/([a-z_]+)=", value)
|
|
if match:
|
|
qualifier_type = match.group(1)
|
|
value = value[len(match.group(0)) :]
|
|
if not value.startswith('"'):
|
|
raise ValueError("Missing starting quote in feature")
|
|
if qualifier_type == "id":
|
|
if not value.endswith('"'):
|
|
raise ValueError("Missing closing quote for id")
|
|
feature.id = value[1:-1]
|
|
else:
|
|
if value.endswith('"'):
|
|
value = value[1:-1]
|
|
else: # continues on the next line
|
|
value = value[1:]
|
|
if qualifier_type in feature.qualifiers:
|
|
raise ValueError(
|
|
f"Feature qualifier {qualifier_type!r} already exists for feature"
|
|
)
|
|
feature.qualifiers[qualifier_type] = value
|
|
return
|
|
# this line is a continuation of the description of the previous feature
|
|
keys = list(feature.qualifiers.keys())
|
|
key = keys[-1]
|
|
description = value.rstrip('"')
|
|
old_description = feature.qualifiers[key]
|
|
if key == "evidence" or old_description.endswith("-"):
|
|
description = f"{old_description}{description}"
|
|
else:
|
|
description = f"{old_description} {description}"
|
|
if feature.type == "VAR_SEQ": # see VARSPLIC above
|
|
try:
|
|
first_seq, second_seq = description.split(" -> ")
|
|
except ValueError:
|
|
pass
|
|
else:
|
|
extra_info = ""
|
|
# we might have more information at the end of the
|
|
# second sequence, which should be in parenthesis
|
|
extra_info_pos = second_seq.find(" (")
|
|
if extra_info_pos != -1:
|
|
extra_info = second_seq[extra_info_pos:]
|
|
second_seq = second_seq[:extra_info_pos]
|
|
# now clean spaces out of the first and second string
|
|
first_seq = first_seq.replace(" ", "")
|
|
second_seq = second_seq.replace(" ", "")
|
|
# reassemble the description
|
|
description = first_seq + " -> " + second_seq + extra_info
|
|
feature.qualifiers[key] = description
|
|
|
|
|
|
if __name__ == "__main__":
|
|
from Bio._utils import run_doctest
|
|
|
|
run_doctest(verbose=0)
|