add KEGG Gene parser

Squashed commit of pull request #1238
2025-10-20 13:43:47 +08:00 · 2017-07-28 01:18:03 +09:00
parent 613a2b285a
commit 5df03df8f0
4 changed files with 229 additions and 0 deletions
--- a/Bio/KEGG/Gene/__init__.py
+++ b/Bio/KEGG/Gene/__init__.py
@ -0,0 +1,147 @@
+# Copyright 2017 by Kozo Nishida.  All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Code to work with the KEGG Gene database.
+
+Functions:
+parse - Returns an iterator giving Record objects.
+
+Classes:
+Record - A representation of a KEGG Gene.
+"""
+
+# other Biopython stuff
+from __future__ import print_function
+
+from Bio.KEGG import _write_kegg
+from Bio.KEGG import _wrap_kegg
+
+# Set up line wrapping rules (see Bio.KEGG._wrap_kegg)
+# name_wrap is the rule applied to every entry of the NAME field.
+name_wrap = [0, "",
+             (" ", "$", 1, 1),
+             ("-", "$", 1, 1)]
+
+
+def id_wrap(indent):
+    """Return a fresh wrapping rule for a blank-separated list of ids.
+
+    ``indent`` becomes the first element of the rule; the remaining
+    elements are fixed.  The rule format is the one consumed by
+    Bio.KEGG._wrap_kegg.
+    """
+    return [indent, "", (" ", "", 1, 0)]
+
+
+class Record(object):
+    """Container for the fields parsed from one KEGG Gene record.
+
+    Attributes:
+     - entry       The entry identifier.
+     - name        A list of the gene names.
+     - definition  The definition for the gene.
+     - orthology   A list of 2-tuples: (orthology id, role)
+     - organism    A 2-tuple (organism id, organism) after parsing; a
+                   new record starts with the empty string here.
+     - position    The position for the gene
+     - motif       A list of 2-tuples: (database, list of link ids)
+     - dblinks     A list of 2-tuples: (database, list of link ids)
+
+    Converting a record to a string writes only the ENTRY, NAME and
+    DBLINKS fields, followed by the "///" terminator.
+
+    """
+
+    def __init__(self):
+        """Create an empty record with every field blank."""
+        self.entry = ""
+        self.name = []
+        self.definition = ""
+        self.orthology = []
+        self.organism = ""
+        self.position = ""
+        self.motif = []
+        self.dblinks = []
+
+    def __str__(self):
+        """Return the record as KEGG flat-file text ending in "///"."""
+        sections = [self._entry(), self._name(), self._dblinks(), "///"]
+        return "".join(sections)
+
+    def _entry(self):
+        """Format the ENTRY field."""
+        values = [self.entry]
+        return _write_kegg("ENTRY", values)
+
+    def _name(self):
+        """Format the NAME field, wrapping each name with name_wrap."""
+        wrapped = [_wrap_kegg(gene_name, wrap_rule=name_wrap)
+                   for gene_name in self.name]
+        return _write_kegg("NAME", wrapped)
+
+    def _definition(self):
+        """Format the DEFINITION field."""
+        values = [self.definition]
+        return _write_kegg("DEFINITION", values)
+
+    def _dblinks(self):
+        """Format the DBLINKS field, one "database: ids" line per row."""
+        rows = [link[0] + ": " + " ".join(link[1])
+                for link in self.dblinks]
+        wrapped = [_wrap_kegg(text, wrap_rule=id_wrap(9))
+                   for text in rows]
+        return _write_kegg("DBLINKS", wrapped)
+
+
+def _add_link_row(rows, data):
+    """Store one MOTIF/DBLINKS data line in ``rows``.
+
+    A line of the form "database: id id ..." starts a new
+    (database, list of ids) row.  Any other line is treated as a
+    continuation and its ids are added to the most recent row.
+
+    Raises ValueError if a continuation line arrives before any row.
+    """
+    key, sep, values = data.partition(": ")
+    if not sep and data.endswith(":"):
+        # A database name with nothing after the colon (the caller has
+        # already stripped trailing blanks, so there is no ": " to find).
+        key, sep, values = data[:-1], ":", ""
+    if sep:
+        rows.append((key, values.split()))
+    elif rows:
+        # The tuple holds the list itself, so extending it in place
+        # updates the stored row.
+        rows[-1][1].extend(data.split())
+    else:
+        raise ValueError("Link ids without a database name: %r" % data)
+
+
+def parse(handle):
+    """Parse a KEGG Gene file, returning Record objects.
+
+    This is an iterator function, typically used in a for loop.  For
+    example, using one of the example KEGG files in the Biopython
+    test suite,
+
+    >>> with open("KEGG/gene.sample") as handle:
+    ...     for record in parse(handle):
+    ...         print("%s %s" % (record.entry, record.name[0]))
+    ...
+    b1174 minE
+    b1175 minD
+
+    Arguments:
+     - handle - an iterable of text lines in KEGG Gene flat-file format.
+
+    A Record is yielded each time a line starting with "///" is read;
+    lines after the last "///" are not returned.  Only the ENTRY, NAME,
+    DEFINITION, ORTHOLOGY, ORGANISM, POSITION, MOTIF and DBLINKS fields
+    are stored; every other field is skipped.
+
+    Raises ValueError if a MOTIF or DBLINKS field starts with ids that
+    have no database name.
+    """
+    record = Record()
+    # Keyword of the field being read.  Continuation lines start with 12
+    # blanks and reuse it; starting from "" means a continuation line
+    # seen before any keyword is skipped instead of failing.
+    keyword = ""
+    for line in handle:
+        if line[:3] == "///":
+            yield record
+            record = Record()
+            # Do not let the last field of one record leak into the next.
+            keyword = ""
+            continue
+        if line[:12] != "            ":
+            keyword = line[:12]
+        data = line[12:].strip()
+        if not data:
+            # Nothing to store on this line.
+            continue
+        if keyword == "ENTRY       ":
+            # Only the first word (the identifier) is kept.
+            record.entry = data.split()[0]
+        elif keyword == "NAME        ":
+            record.name.append(data.strip(";"))
+        elif keyword == "DEFINITION  ":
+            record.definition = data
+        elif keyword == "ORTHOLOGY   ":
+            # Identifier and description are separated by two blanks;
+            # split on the first pair only so the description may itself
+            # contain double blanks.
+            ident, _, role = data.partition("  ")
+            record.orthology.append((ident, role.strip()))
+        elif keyword == "ORGANISM    ":
+            ident, _, organism_name = data.partition("  ")
+            record.organism = (ident, organism_name.strip())
+        elif keyword == "POSITION    ":
+            record.position = data
+        elif keyword == "MOTIF       ":
+            _add_link_row(record.motif, data)
+        elif keyword == "DBLINKS     ":
+            _add_link_row(record.dblinks, data)
+
+
+if __name__ == "__main__":
+    # Running the module as a script executes its doctests.
+    from Bio import _utils
+    _utils.run_doctest()
--- a/Tests/KEGG/gene.sample
+++ b/Tests/KEGG/gene.sample
@ -0,0 +1,80 @@
+ENTRY       b1174             CDS       T00007
+NAME        minE
+DEFINITION  (RefSeq) cell division topological specificity factor
+ORTHOLOGY   K03608  cell division topological specificity factor
+ORGANISM    eco  Escherichia coli K-12 MG1655
+BRITE       Chromosome [BR:eco03036]
+             Prokaryotic Type
+              Chromosome partitioning proteins
+               Inhibitors of FtsZ assembly
+                b1174 (minE)
+            Cytoskeleton proteins [BR:eco04812]
+             Prokaryotic cytoskeleton proteins
+              MinD / ParA class of bacterial cytoskeletal proteins
+               b1174 (minE)
+POSITION    complement(1224279..1224545)
+MOTIF       Pfam: MinE DUF4364
+DBLINKS     NCBI-ProteinID: NP_415692
+            NCBI-GeneID: 945740
+            Pasteur: minE
+            RegulonDB: ECK120000591
+            ECOCYC: EG10598
+            ASAP: ABE-0003935
+            UniProt: P0A734
+STRUCTURE   PDB: 3R9I 3R9J 1EV0
+AASEQ       88
+            MALLDFFLSRKKNTANIAKERLQIIVAERRRSDAEPHYLPQLRKDILEVICKYVQIDPEM
+            VTVQLEQKDGDISILELNVTLPEAEELK
+NTSEQ       267
+            atggcattactcgatttctttctctcgcggaagaaaaacacagccaacattgcaaaagaa
+            cggctgcagattattgttgctgaacgccgtcgcagcgatgcagaaccgcattatctgccg
+            cagttgcgtaaagatattcttgaggtcatttgtaaatacgtacaaattgatcctgagatg
+            gtaaccgtacagcttgagcaaaaagatggcgatatttctattcttgagctgaacgtgacc
+            ttaccggaagcagaagagctgaaataa
+///
+ENTRY       b1175             CDS       T00007
+NAME        minD
+DEFINITION  (RefSeq) inhibitor of FtsZ ring polymerization; chromosome-membrane tethering protein; membrane ATPase of the MinCDEE system
+ORTHOLOGY   K03609  septum site-determining protein MinD
+ORGANISM    eco  Escherichia coli K-12 MG1655
+BRITE       Chromosome [BR:eco03036]
+             Prokaryotic Type
+              Chromosome partitioning proteins
+               Inhibitors of FtsZ assembly
+                b1175 (minD)
+            Cytoskeleton proteins [BR:eco04812]
+             Prokaryotic cytoskeleton proteins
+              MinD / ParA class of bacterial cytoskeletal proteins
+               b1175 (minD)
+POSITION    complement(1224549..1225361)
+MOTIF       Pfam: CbiA AAA_31 ParA ArsA_ATPase MipZ CBP_BcsQ Fer4_NifH AAA_24 CLP1_P SRP54 VirC1 DUF87
+DBLINKS     NCBI-ProteinID: NP_415693
+            NCBI-GeneID: 945741
+            Pasteur: minD
+            RegulonDB: ECK120000590
+            ECOCYC: EG10597
+            ASAP: ABE-0003938
+            UniProt: P0AEZ3
+STRUCTURE   PDB: 3Q9L 3R9I 3R9J
+AASEQ       270
+            MARIIVVTSGKGGVGKTTSSAAIATGLAQKGKKTVVIDFDIGLRNLDLIMGCERRVVYDF
+            VNVIQGDATLNQALIKDKRTENLYILPASQTRDKDALTREGVAKVLDDLKAMDFEFIVCD
+            SPAGIETGALMALYFADEAIITTNPEVSSVRDSDRILGILASKSRRAENGEEPIKEHLLL
+            TRYNPGRVSRGDMLSMEDVLEILRIKLVGVIPEDQSVLRASNQGEPVILDINADAGKAYA
+            DTVERLLGEERPFRFIEEEKKGFLKRLFGG
+NTSEQ       813
+            atggcacgcattattgttgttacttcgggcaaagggggtgttggtaagacaacctccagc
+            gcggccatcgccactggtttggcccagaagggaaagaaaactgtcgtgatagattttgat
+            atcggcctgcgtaatctcgacctgattatgggttgtgaacgccgggtcgtttacgatttc
+            gtcaacgtcattcagggcgatgcaacgctaaatcaggcgttaattaaagataagcgtact
+            gaaaatctctatattctgccggcatcgcaaacacgcgataaagatgccctcacccgtgaa
+            ggggtcgccaaagttcttgatgatctgaaagcgatggattttgaatttatcgtttgtgac
+            tccccggcagggattgaaaccggtgcgttaatggcactctattttgcagacgaagccatt
+            attaccaccaacccggaagtctcctcagtacgcgactctgaccgtattttaggcattctg
+            gcgtcgaaatcacgccgcgcagaaaatggcgaagagcctattaaagagcacctgctgtta
+            acgcgctataacccaggccgcgtaagcagaggtgacatgctgagcatggaagatgtgctg
+            gagatcctgcgcatcaaactcgtcggcgtgatcccagaggatcaatcagtattgcgcgcc
+            tctaaccagggtgaaccggtcattctcgacattaacgccgatgcgggtaaagcctacgca
+            gataccgtagaacgtctgttgggagaagaacgtcctttccgcttcattgaagaagagaag
+            aaaggcttcctcaaacgcttgttcggaggataa
+///
--- a/Tests/run_tests.py
+++ b/Tests/run_tests.py
@ -102,6 +102,7 @@ DOCTEST_MODULES = [
    "Bio.Graphics.GenomeDiagram._Colors",
    "Bio.KEGG.Compound",
    "Bio.KEGG.Enzyme",
+    "Bio.KEGG.Gene",
    "Bio.KEGG.KGML.KGML_parser",
    "Bio.NMR.xpktools",
    "Bio.motifs",
--- a/setup.py
+++ b/setup.py
@ -295,6 +295,7 @@ PACKAGES = [
    'Bio.KEGG',
    'Bio.KEGG.Compound',
    'Bio.KEGG.Enzyme',
+    'Bio.KEGG.Gene',
    'Bio.KEGG.Map',
    'Bio.PDB.mmtf',
    'Bio.KEGG.KGML',