add KEGG Gene parser

Squashed commit of pull request #1238
This commit is contained in:
Kozo Nishida
2017-07-28 01:18:03 +09:00
committed by Peter Cock
parent 613a2b285a
commit 5df03df8f0
4 changed files with 229 additions and 0 deletions

147
Bio/KEGG/Gene/__init__.py Normal file
View File

@ -0,0 +1,147 @@
# Copyright 2017 by Kozo Nishida. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Code to work with the KEGG Gene database.
Functions:
parse - Returns an iterator giving Record objects.
Classes:
Record - A representation of a KEGG Gene.
"""
# other Biopython stuff
from __future__ import print_function
from Bio.KEGG import _write_kegg
from Bio.KEGG import _wrap_kegg
# Line wrapping rules handed to Bio.KEGG._wrap_kegg as ``wrap_rule``;
# see that function for the meaning of each field.

# Rule used when formatting gene names.
name_wrap = [
    0,
    "",
    (" ", "$", 1, 1),
    ("-", "$", 1, 1),
]


def id_wrap(indent):
    """Return a fresh wrap rule for id lists, using the given indent."""
    return [indent, "", (" ", "", 1, 0)]
class Record(object):
    """Container for the fields of one KEGG Gene entry.

    Attributes:
     - entry       The entry identifier.
     - name        A list of the gene names.
     - definition  The definition for the gene.
     - orthology   A list of 2-tuples: (orthology id, role)
     - organism    A tuple: (organism id, organism); an empty string
                   until it has been set.
     - position    The position for the gene
     - motif       A list of 2-tuples: (database, list of link ids)
     - dblinks     A list of 2-tuples: (database, list of link ids)

    """

    def __init__(self):
        """Create an empty record; every field starts out blank."""
        # Scalar (string) fields.
        self.entry = ""
        self.definition = ""
        self.organism = ""
        self.position = ""
        # Multi-valued fields; each instance gets its own lists.
        self.name = []
        self.orthology = []
        self.motif = []
        self.dblinks = []

    def __str__(self):
        """Return the ENTRY, NAME and DBLINKS sections followed by ``///``."""
        # Only these three sections are written out; the other attributes
        # are not part of the string form.
        return (
            self._entry()
            + self._name()
            + self._dblinks()
            + "///"
        )

    def _entry(self):
        """Format the ENTRY section (private)."""
        items = [self.entry]
        return _write_kegg("ENTRY", items)

    def _name(self):
        """Format the NAME section, wrapping each name (private)."""
        wrapped = [
            _wrap_kegg(gene_name, wrap_rule=name_wrap)
            for gene_name in self.name
        ]
        return _write_kegg("NAME", wrapped)

    def _definition(self):
        """Format the DEFINITION section (private)."""
        items = [self.definition]
        return _write_kegg("DEFINITION", items)

    def _dblinks(self):
        """Format the DBLINKS section, one wrapped line per database (private)."""
        # Each row is rendered as "<database>: <id> <id> ..." before wrapping.
        lines = [row[0] + ": " + " ".join(row[1]) for row in self.dblinks]
        wrapped = [_wrap_kegg(text, wrap_rule=id_wrap(9)) for text in lines]
        return _write_kegg("DBLINKS", wrapped)
def parse(handle):
    """Parse a KEGG Gene file, returning Record objects.

    This is an iterator function, typically used in a for loop.  For
    example, using one of the example KEGG files in the Biopython
    test suite,

    >>> with open("KEGG/gene.sample") as handle:
    ...     for record in parse(handle):
    ...         print("%s %s" % (record.entry, record.name[0]))
    ...
    b1174 minE
    b1175 minD

    Arguments:
     - handle - an iterable of text lines (e.g. an open file).  The first
       12 columns of each line hold the field keyword (blank on
       continuation lines) and the rest holds the data.

    A record is yielded each time a line starting with ``///`` is read;
    lines after the last ``///`` are not yielded.  Fields other than
    ENTRY, NAME, DEFINITION, ORTHOLOGY, ORGANISM, POSITION, MOTIF and
    DBLINKS are skipped.

    Raises ValueError if a MOTIF or DBLINKS continuation line appears
    before any ``database: ids`` row of that field.
    """
    record = Record()
    keyword = ""
    for line in handle:
        if line[:3] == "///":
            yield record
            record = Record()
            keyword = ""
            continue
        # A blank keyword column marks a continuation of the previous
        # field; stripping the column means the comparisons below do not
        # depend on the exact amount of padding.
        field = line[:12].strip()
        if field:
            keyword = field
        data = line[12:].strip()
        if not data:
            # Nothing to record (blank line or keyword without a value).
            continue
        if keyword == "ENTRY":
            # Only the first word (the identifier) is kept.
            record.entry = data.split()[0]
        elif keyword == "NAME":
            record.name.append(data.strip(";"))
        elif keyword == "DEFINITION":
            record.definition = data
        elif keyword == "ORTHOLOGY":
            record.orthology.append(_split_id_name(data))
        elif keyword == "ORGANISM":
            record.organism = _split_id_name(data)
        elif keyword == "POSITION":
            record.position = data
        elif keyword == "MOTIF":
            _add_link_row(record.motif, data, keyword)
        elif keyword == "DBLINKS":
            _add_link_row(record.dblinks, data, keyword)


def _split_id_name(data):
    """Split ``"<id> <description>"`` into an ``(id, description)`` tuple.

    Only the first run of whitespace separates the two parts, so the
    description may itself contain spaces.  A bare id gives an empty
    description.  ``data`` must contain at least one non-blank character.
    """
    parts = data.split(None, 1)
    if len(parts) == 2:
        return parts[0], parts[1]
    return parts[0], ""


def _add_link_row(rows, data, field):
    """Add ``"<database>: <ids>"`` data to rows, or extend the last row.

    ``rows`` is a list of ``(database, list of ids)`` tuples and is
    modified in place.  Data without a ``": "`` separator is treated as
    a continuation holding more ids for the most recent database.
    ``field`` is only used in the error message.
    """
    database, separator, ids = data.partition(": ")
    if separator:
        rows.append((database, ids.split()))
    elif rows:
        rows[-1][1].extend(data.split())
    else:
        raise ValueError("%s continuation line without a preceding "
                         "'database: ids' row: %r" % (field, data))
if __name__ == "__main__":
    # Executed as a script: run this module's doctests through the
    # helper in Bio._utils.
    from Bio import _utils

    _utils.run_doctest()

80
Tests/KEGG/gene.sample Normal file
View File

@ -0,0 +1,80 @@
ENTRY b1174 CDS T00007
NAME minE
DEFINITION (RefSeq) cell division topological specificity factor
ORTHOLOGY K03608 cell division topological specificity factor
ORGANISM eco Escherichia coli K-12 MG1655
BRITE Chromosome [BR:eco03036]
Prokaryotic Type
Chromosome partitioning proteins
Inhibitors of FtsZ assembly
b1174 (minE)
Cytoskeleton proteins [BR:eco04812]
Prokaryotic cytoskeleton proteins
MinD / ParA class of bacterial cytoskeletal proteins
b1174 (minE)
POSITION complement(1224279..1224545)
MOTIF Pfam: MinE DUF4364
DBLINKS NCBI-ProteinID: NP_415692
NCBI-GeneID: 945740
Pasteur: minE
RegulonDB: ECK120000591
ECOCYC: EG10598
ASAP: ABE-0003935
UniProt: P0A734
STRUCTURE PDB: 3R9I 3R9J 1EV0
AASEQ 88
MALLDFFLSRKKNTANIAKERLQIIVAERRRSDAEPHYLPQLRKDILEVICKYVQIDPEM
VTVQLEQKDGDISILELNVTLPEAEELK
NTSEQ 267
atggcattactcgatttctttctctcgcggaagaaaaacacagccaacattgcaaaagaa
cggctgcagattattgttgctgaacgccgtcgcagcgatgcagaaccgcattatctgccg
cagttgcgtaaagatattcttgaggtcatttgtaaatacgtacaaattgatcctgagatg
gtaaccgtacagcttgagcaaaaagatggcgatatttctattcttgagctgaacgtgacc
ttaccggaagcagaagagctgaaataa
///
ENTRY b1175 CDS T00007
NAME minD
DEFINITION (RefSeq) inhibitor of FtsZ ring polymerization; chromosome-membrane tethering protein; membrane ATPase of the MinCDEE system
ORTHOLOGY K03609 septum site-determining protein MinD
ORGANISM eco Escherichia coli K-12 MG1655
BRITE Chromosome [BR:eco03036]
Prokaryotic Type
Chromosome partitioning proteins
Inhibitors of FtsZ assembly
b1175 (minD)
Cytoskeleton proteins [BR:eco04812]
Prokaryotic cytoskeleton proteins
MinD / ParA class of bacterial cytoskeletal proteins
b1175 (minD)
POSITION complement(1224549..1225361)
MOTIF Pfam: CbiA AAA_31 ParA ArsA_ATPase MipZ CBP_BcsQ Fer4_NifH AAA_24 CLP1_P SRP54 VirC1 DUF87
DBLINKS NCBI-ProteinID: NP_415693
NCBI-GeneID: 945741
Pasteur: minD
RegulonDB: ECK120000590
ECOCYC: EG10597
ASAP: ABE-0003938
UniProt: P0AEZ3
STRUCTURE PDB: 3Q9L 3R9I 3R9J
AASEQ 270
MARIIVVTSGKGGVGKTTSSAAIATGLAQKGKKTVVIDFDIGLRNLDLIMGCERRVVYDF
VNVIQGDATLNQALIKDKRTENLYILPASQTRDKDALTREGVAKVLDDLKAMDFEFIVCD
SPAGIETGALMALYFADEAIITTNPEVSSVRDSDRILGILASKSRRAENGEEPIKEHLLL
TRYNPGRVSRGDMLSMEDVLEILRIKLVGVIPEDQSVLRASNQGEPVILDINADAGKAYA
DTVERLLGEERPFRFIEEEKKGFLKRLFGG
NTSEQ 813
atggcacgcattattgttgttacttcgggcaaagggggtgttggtaagacaacctccagc
gcggccatcgccactggtttggcccagaagggaaagaaaactgtcgtgatagattttgat
atcggcctgcgtaatctcgacctgattatgggttgtgaacgccgggtcgtttacgatttc
gtcaacgtcattcagggcgatgcaacgctaaatcaggcgttaattaaagataagcgtact
gaaaatctctatattctgccggcatcgcaaacacgcgataaagatgccctcacccgtgaa
ggggtcgccaaagttcttgatgatctgaaagcgatggattttgaatttatcgtttgtgac
tccccggcagggattgaaaccggtgcgttaatggcactctattttgcagacgaagccatt
attaccaccaacccggaagtctcctcagtacgcgactctgaccgtattttaggcattctg
gcgtcgaaatcacgccgcgcagaaaatggcgaagagcctattaaagagcacctgctgtta
acgcgctataacccaggccgcgtaagcagaggtgacatgctgagcatggaagatgtgctg
gagatcctgcgcatcaaactcgtcggcgtgatcccagaggatcaatcagtattgcgcgcc
tctaaccagggtgaaccggtcattctcgacattaacgccgatgcgggtaaagcctacgca
gataccgtagaacgtctgttgggagaagaacgtcctttccgcttcattgaagaagagaag
aaaggcttcctcaaacgcttgttcggaggataa
///

View File

@ -102,6 +102,7 @@ DOCTEST_MODULES = [
"Bio.Graphics.GenomeDiagram._Colors",
"Bio.KEGG.Compound",
"Bio.KEGG.Enzyme",
"Bio.KEGG.Gene",
"Bio.KEGG.KGML.KGML_parser",
"Bio.NMR.xpktools",
"Bio.motifs",

View File

@ -295,6 +295,7 @@ PACKAGES = [
'Bio.KEGG',
'Bio.KEGG.Compound',
'Bio.KEGG.Enzyme',
'Bio.KEGG.Gene',
'Bio.KEGG.Map',
'Bio.PDB.mmtf',
'Bio.KEGG.KGML',