biopython/Bio/KEGG/Enzyme/__init__.py

# Copyright 2001 by Tarjei Mikkelsen.  All rights reserved.
# Copyright 2007 by Michiel de Hoon.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Code to work with the KEGG Enzyme database.

Functions:
 - parse - Returns an iterator giving Record objects.
 - read - Returns the single Record found in a file with exactly one entry.

Classes:
 - Record - Holds the information from a KEGG Enzyme record.
"""

from Bio.KEGG import _default_wrap
from Bio.KEGG import _struct_wrap
from Bio.KEGG import _wrap_kegg
from Bio.KEGG import _write_kegg

# Set up line wrapping rules (see Bio.KEGG._wrap_kegg)
# Each rule below is handed to _wrap_kegg through its ``wrap_rule``
# argument; the meaning of the individual list/tuple elements is defined
# by Bio.KEGG._wrap_kegg, not by this module.
#
# rxn_wrap is the rule Record uses for the REACTION field.
rxn_wrap = [
    0,
    "",
    (" + ", "", 1, 1),
    (" = ", "", 1, 1),
    (" ", "$", 1, 1),
    ("-", "$", 1, 1),
]
# name_wrap is shared by the NAME, SYSNAME, SUBSTRATE, PRODUCT, INHIBITOR,
# COFACTOR and EFFECTOR fields of Record.
name_wrap = [0, "", (" ", "$", 1, 1), ("-", "$", 1, 1)]
# id_wrap and struct_wrap are not rules themselves: Record calls them with
# an integer (e.g. id_wrap(16), struct_wrap(5)) to obtain a rule.
# NOTE(review): the integer is presumably an indent width - confirm
# against Bio.KEGG._default_wrap / _struct_wrap.
id_wrap = _default_wrap
struct_wrap = _struct_wrap


class Record:
    """Container for the fields of one KEGG Enzyme entry.

    Attributes:
     - entry       EC number as a string, without the 'EC ' prefix.
     - name        List of enzyme names.
     - classname   List of classification terms.
     - sysname     List of systematic name lines.
     - reaction    List of reaction description strings.
     - substrate   List of substrates.
     - product     List of products.
     - inhibitor   List of inhibitors.
     - cofactor    List of cofactors.
     - effector    List of effectors.
     - comment     List of comment strings.
     - pathway     List of 3-tuples: (database, id, pathway)
     - genes       List of 2-tuples: (organism, list of gene ids)
     - disease     List of 3-tuples: (database, id, disease)
     - structures  List of 2-tuples: (database, list of struct ids)
     - dblinks     List of 2-tuples: (database, list of db ids)

    Calling str() on a Record writes the fields in the order listed
    above and finishes with the "///" terminator.

    """

    def __init__(self):
        """Create an empty Record: blank entry, one fresh list per field."""
        self.entry = ""
        # Plain text fields, one string per line.
        self.name = []
        self.classname = []
        self.sysname = []
        self.reaction = []
        self.substrate = []
        self.product = []
        self.inhibitor = []
        self.cofactor = []
        self.effector = []
        self.comment = []
        # Structured fields, one tuple per row.
        self.pathway = []
        self.genes = []
        self.disease = []
        self.structures = []
        self.dblinks = []

    def __str__(self):
        """Return the record as text: each field in turn, then '///'."""
        sections = [
            self._entry(),
            self._name(),
            self._classname(),
            self._sysname(),
            self._reaction(),
            self._substrate(),
            self._product(),
            self._inhibitor(),
            self._cofactor(),
            self._effector(),
            self._comment(),
            self._pathway(),
            self._genes(),
            self._disease(),
            self._structures(),
            self._dblinks(),
            "///",
        ]
        return "".join(sections)

    def _wrapped_field(self, item, lines, rule):
        """Wrap each of *lines* with the fixed *rule* and write them as *item*."""
        wrapped = [_wrap_kegg(text, wrap_rule=rule) for text in lines]
        return _write_kegg(item, wrapped)

    def _entry(self):
        # The 'EC ' prefix is not stored in self.entry, so add it back here.
        ec_line = "EC " + self.entry
        return _write_kegg("ENTRY", [ec_line])

    def _name(self):
        return self._wrapped_field("NAME", self.name, name_wrap)

    def _classname(self):
        # CLASS lines are written as stored, without wrapping.
        return _write_kegg("CLASS", self.classname)

    def _sysname(self):
        return self._wrapped_field("SYSNAME", self.sysname, name_wrap)

    def _reaction(self):
        return self._wrapped_field("REACTION", self.reaction, rxn_wrap)

    def _substrate(self):
        return self._wrapped_field("SUBSTRATE", self.substrate, name_wrap)

    def _product(self):
        return self._wrapped_field("PRODUCT", self.product, name_wrap)

    def _inhibitor(self):
        return self._wrapped_field("INHIBITOR", self.inhibitor, name_wrap)

    def _cofactor(self):
        return self._wrapped_field("COFACTOR", self.cofactor, name_wrap)

    def _effector(self):
        return self._wrapped_field("EFFECTOR", self.effector, name_wrap)

    def _comment(self):
        wrapped = []
        for text in self.comment:
            wrapped.append(_wrap_kegg(text, wrap_rule=id_wrap(0)))
        return _write_kegg("COMMENT", wrapped)

    def _pathway(self):
        # Flatten each (database, id, pathway) tuple to one line first.
        flat = [row[0] + ": " + row[1] + "  " + row[2] for row in self.pathway]
        wrapped = []
        for text in flat:
            wrapped.append(_wrap_kegg(text, wrap_rule=id_wrap(16)))
        return _write_kegg("PATHWAY", wrapped)

    def _genes(self):
        # Flatten each (organism, gene ids) tuple to "organism: id id ...".
        flat = [row[0] + ": " + " ".join(row[1]) for row in self.genes]
        wrapped = []
        for text in flat:
            wrapped.append(_wrap_kegg(text, wrap_rule=id_wrap(5)))
        return _write_kegg("GENES", wrapped)

    def _disease(self):
        # Flatten each (database, id, disease) tuple to one line first.
        flat = [row[0] + ": " + row[1] + "  " + row[2] for row in self.disease]
        wrapped = []
        for text in flat:
            wrapped.append(_wrap_kegg(text, wrap_rule=id_wrap(13)))
        return _write_kegg("DISEASE", wrapped)

    def _structures(self):
        # Ids are separated by two spaces, and two more spaces follow the
        # last id.
        flat = [row[0] + ": " + "  ".join(row[1]) + "  " for row in self.structures]
        wrapped = []
        for text in flat:
            wrapped.append(_wrap_kegg(text, wrap_rule=struct_wrap(5)))
        return _write_kegg("STRUCTURES", wrapped)

    def _dblinks(self):
        # This is a bit of a cheat that won't work if enzyme entries
        # have more than one link id per db id. For now, that's not
        # the case - storing links ids in a list is only to make
        # this class similar to the Compound.Record class.
        flat = [row[0] + ": " + "  ".join(row[1]) for row in self.dblinks]
        return _write_kegg("DBLINKS", flat)


def parse(handle):
    """Parse a KEGG Enzyme file, returning Record objects.

    Arguments:
     - handle - an iterable of text lines, typically an open file.

    The first 12 characters of a line hold the field keyword; a line
    whose first 12 characters are blank continues the field of the
    previous keyword line.  A Record is yielded each time a line starting
    with "///" is read, so data after the last "///" is not returned.
    Fields with a keyword that is not handled here are skipped, and so
    are continuation lines that appear before any keyword line.

    This is an iterator function, typically used in a for loop.  For
    example, using one of the example KEGG files in the Biopython
    test suite,

    >>> with open("KEGG/enzyme.sample") as handle:
    ...     for record in parse(handle):
    ...         print("%s %s" % (record.entry, record.name[0]))
    ...
    1.1.1.1 alcohol dehydrogenase
    1.1.1.62 17beta-estradiol 17-dehydrogenase
    1.1.1.68 Transferred to 1.5.1.20
    1.6.5.3 NADH:ubiquinone reductase (H+-translocating)
    1.14.13.28 3,9-dihydroxypterocarpan 6a-monooxygenase
    2.4.1.68 glycoprotein 6-alpha-L-fucosyltransferase
    3.1.1.6 acetylesterase
    2.7.2.1 acetate kinase

    """
    record = Record()
    # Start with an empty keyword so that a continuation line seen before
    # any keyword line is skipped instead of raising UnboundLocalError.
    keyword = ""
    for line in handle:
        if line[:3] == "///":
            yield record
            record = Record()
            continue
        # A non-blank 12-character prefix starts a new field; otherwise the
        # keyword of the previous line stays in effect.
        if line[:12] != "            ":
            keyword = line[:12]
        data = line[12:].strip()
        if keyword == "ENTRY       ":
            # The first word is the "EC" tag; the EC number is the second.
            words = data.split()
            record.entry = words[1]
        elif keyword == "CLASS       ":
            record.classname.append(data)
        elif keyword == "COFACTOR    ":
            record.cofactor.append(data)
        elif keyword == "COMMENT     ":
            record.comment.append(data)
        elif keyword == "DBLINKS     ":
            if ":" in data:
                # Split on the first colon only: a second colon in the
                # value part must not break the two-way unpacking.
                key, values = data.split(":", 1)
                record.dblinks.append((key, values.split()))
            else:
                # Continuation line: more ids for the most recent database.
                record.dblinks[-1][1].extend(data.split())
        elif keyword == "DISEASE     ":
            if ":" in data:
                # Split on the first colon only so that disease names
                # containing ":" are kept intact.
                database, rest = data.split(":", 1)
                number, name = rest.split(None, 1)
                record.disease.append((database, number, name))
            else:
                # Continuation line: the disease name ran over to a new line.
                database, number, name = record.disease[-1]
                record.disease[-1] = (database, number, name + " " + data)
        elif keyword == "EFFECTOR    ":
            record.effector.append(data.strip(";"))
        elif keyword == "GENES       ":
            # A new organism row has ": " after a 3 or 4 character code.
            if data[3:5] == ": " or data[4:6] == ": ":
                key, values = data.split(":", 1)
                # Keep only the part of each gene id before any "(".
                genes = [value.split("(")[0] for value in values.split()]
                record.genes.append((key, genes))
            else:
                # Continuation line: more gene ids for the same organism.
                record.genes[-1][1].extend(
                    value.split("(")[0] for value in data.split()
                )
        elif keyword == "INHIBITOR   ":
            record.inhibitor.append(data.strip(";"))
        elif keyword == "NAME        ":
            record.name.append(data.strip(";"))
        elif keyword == "PATHWAY     ":
            if data[:5] == "PATH:":
                # "PATH: <id> <name>": drop the leading "PATH:" word.
                _, map_num, name = data.split(None, 2)
            else:
                # "<id> <name>" without the database prefix.
                map_num, name = data.split(None, 1)
            record.pathway.append(("PATH", map_num, name))
        elif keyword == "PRODUCT     ":
            record.product.append(data.strip(";"))
        elif keyword == "REACTION    ":
            record.reaction.append(data.strip(";"))
        elif keyword == "STRUCTURES  ":
            if data[:4] == "PDB:":
                record.structures.append((data[:3], data[4:].split()))
            else:
                # Continuation line: more accessions for the last database.
                record.structures[-1][1].extend(data.split())
        elif keyword == "SUBSTRATE   ":
            record.substrate.append(data.strip(";"))
        elif keyword == "SYSNAME     ":
            record.sysname.append(data.strip(";"))


def read(handle):
    """Parse a KEGG Enzyme file with exactly one entry.

    The handle is passed to parse(); the one record it produces is
    returned.  A ValueError is raised if parse() produces no record, or
    more than one.  For example:

    >>> with open("KEGG/enzyme.new") as handle:
    ...     record = read(handle)
    ...     print("%s %s" % (record.entry, record.name[0]))
    ...
    6.2.1.25 benzoate---CoA ligase
    """
    # Unique marker returned by next() once the iterator is exhausted.
    exhausted = object()
    entries = parse(handle)
    first = next(entries, exhausted)
    if first is exhausted:
        raise ValueError("No records found in handle") from None
    # Pull one more item to make sure nothing follows the first record.
    if next(entries, exhausted) is not exhausted:
        raise ValueError("More than one record found in handle")
    return first


if __name__ == "__main__":
    # Executed as a script: hand off to Biopython's run_doctest helper
    # (presumably runs the doctest examples in this module's docstrings).
    from Bio._utils import run_doctest

    run_doctest()