mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
147
Bio/KEGG/Gene/__init__.py
Normal file
147
Bio/KEGG/Gene/__init__.py
Normal file
@ -0,0 +1,147 @@
|
||||
# Copyright 2017 by Kozo Nishida. All rights reserved.
|
||||
# This code is part of the Biopython distribution and governed by its
|
||||
# license. Please see the LICENSE file that should have been included
|
||||
# as part of this package.
|
||||
|
||||
"""Code to work with the KEGG Gene database.
|
||||
|
||||
Functions:
|
||||
parse - Returns an iterator giving Record objects.
|
||||
|
||||
Classes:
|
||||
Record - A representation of a KEGG Gene.
|
||||
"""
|
||||
|
||||
# Python 2/3 compatibility, then helpers shared with the other Bio.KEGG modules
|
||||
from __future__ import print_function
|
||||
|
||||
from Bio.KEGG import _write_kegg
|
||||
from Bio.KEGG import _wrap_kegg
|
||||
|
||||
# Set up line wrapping rules (see Bio.KEGG._wrap_kegg)
|
||||
# Wrap rule applied to each NAME line by Record._name.  The layout of the
# list (leading indent, connector string, then one tuple per break rule) is
# interpreted by Bio.KEGG._wrap_kegg -- see that function for the meaning of
# the individual fields.
name_wrap = [0, "", (" ", "$", 1, 1), ("-", "$", 1, 1)]


def id_wrap(indent):
    """Return a wrap rule for identifier lists, using *indent* as its first field.

    A new list is built on every call, so callers may keep or modify the
    result without affecting later calls.  The rule has the same layout as
    ``name_wrap`` and is consumed by Bio.KEGG._wrap_kegg.
    """
    return [indent, "", (" ", "", 1, 0)]
|
||||
|
||||
|
||||
class Record(object):
    """Container for the fields of a single KEGG Gene entry.

    Attributes:
     - entry       The entry identifier.
     - name        A list of the gene names.
     - definition  The definition for the gene.
     - orthology   A list of 2-tuples: (orthology id, role)
     - organism    A tuple: (organism id, organism); "" until it is set.
     - position    The position for the gene
     - motif       A list of 2-tuples: (database, list of motif ids)
     - dblinks     A list of 2-tuples: (database, list of link ids)

    """

    def __init__(self):
        """Create an empty record: blank strings and empty lists."""
        self.entry = ""
        self.name = []
        self.definition = ""
        self.orthology = []
        self.organism = ""
        self.position = ""
        self.motif = []
        self.dblinks = []

    def __str__(self):
        """Return the ENTRY, NAME and DBLINKS sections followed by "///"."""
        sections = (self._entry(), self._name(), self._dblinks(), "///")
        return "".join(sections)

    def _entry(self):
        # Single-item section holding the entry identifier.
        return _write_kegg("ENTRY", [self.entry])

    def _name(self):
        # Each gene name is wrapped on its own using the module's name_wrap rule.
        wrapped = [
            _wrap_kegg(gene_name, wrap_rule=name_wrap) for gene_name in self.name
        ]
        return _write_kegg("NAME", wrapped)

    def _definition(self):
        # Not used by __str__; kept available for callers that want it.
        return _write_kegg("DEFINITION", [self.definition])

    def _dblinks(self):
        # Rebuild one "database: id id ..." line per stored row, then wrap each.
        lines = [row[0] + ": " + " ".join(row[1]) for row in self.dblinks]
        wrapped = [_wrap_kegg(text, wrap_rule=id_wrap(9)) for text in lines]
        return _write_kegg("DBLINKS", wrapped)
|
||||
|
||||
|
||||
def _split_id_and_name(data):
    """Split ``"<id> <free text>"`` at the first run of whitespace.

    Returns a 2-tuple (id, text); text is "" when *data* holds a single token.
    *data* must be non-empty.
    """
    parts = data.split(None, 1)
    ident = parts[0]
    text = parts[1] if len(parts) > 1 else ""
    return ident, text


def _add_link_row(rows, data):
    """Fold one MOTIF/DBLINKS payload line into *rows* (modified in place).

    A payload of the form ``"database: id id ..."`` starts a new
    (database, [ids]) row.  Any other payload is treated as a continuation
    line whose ids are appended to the most recent row; an IndexError is
    raised if there is no such row yet.
    """
    key, sep, rest = data.partition(": ")
    if sep:
        rows.append((key, rest.split()))
    else:
        key, values = rows[-1]
        values.extend(data.split())
        rows[-1] = (key, values)


def parse(handle):
    """Parse a KEGG Gene file, returning Record objects.

    This is an iterator function, typically used in a for loop.  For
    example, using one of the example KEGG files in the Biopython
    test suite,

    >>> with open("KEGG/gene.sample") as handle:
    ...     for record in parse(handle):
    ...         print("%s %s" % (record.entry, record.name[0]))
    ...
    b1174 minE
    b1175 minD

    Arguments:
     - handle - an iterable of text lines in KEGG flat-file layout: the
       keyword occupies the first 12 columns and the payload follows; a line
       whose first 12 columns are blank continues the previous keyword.

    A Record is yielded each time a line starting with "///" is read, so
    data after the last "///" is not returned.  Only the ENTRY, NAME,
    DEFINITION, ORTHOLOGY, ORGANISM, POSITION, MOTIF and DBLINKS keywords
    are stored; lines under any other keyword are ignored, as are lines
    with an empty payload.
    """
    record = Record()
    # Keyword currently in effect; "" until the first keyword line is seen so
    # that a leading continuation line is ignored instead of raising.
    keyword = ""
    for line in handle:
        if line[:3] == "///":
            yield record
            record = Record()
            keyword = ""
            continue
        # Compare keywords without their column padding so the match does not
        # depend on the exact number of trailing spaces in a literal.
        head = line[:12].strip()
        if head:
            keyword = head
        data = line[12:].strip()
        if not data:
            continue
        if keyword == "ENTRY":
            record.entry = data.split()[0]
        elif keyword == "NAME":
            record.name.append(data.strip(";"))
        elif keyword == "DEFINITION":
            record.definition = data
        elif keyword == "ORTHOLOGY":
            record.orthology.append(_split_id_and_name(data))
        elif keyword == "ORGANISM":
            record.organism = _split_id_and_name(data)
        elif keyword == "POSITION":
            record.position = data
        elif keyword == "MOTIF":
            _add_link_row(record.motif, data)
        elif keyword == "DBLINKS":
            _add_link_row(record.dblinks, data)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Executing this module as a script runs its doctests.
    from Bio._utils import run_doctest as _run_doctest

    _run_doctest()
|
80
Tests/KEGG/gene.sample
Normal file
80
Tests/KEGG/gene.sample
Normal file
@ -0,0 +1,80 @@
|
||||
ENTRY b1174 CDS T00007
|
||||
NAME minE
|
||||
DEFINITION (RefSeq) cell division topological specificity factor
|
||||
ORTHOLOGY K03608 cell division topological specificity factor
|
||||
ORGANISM eco Escherichia coli K-12 MG1655
|
||||
BRITE Chromosome [BR:eco03036]
|
||||
Prokaryotic Type
|
||||
Chromosome partitioning proteins
|
||||
Inhibitors of FtsZ assembly
|
||||
b1174 (minE)
|
||||
Cytoskeleton proteins [BR:eco04812]
|
||||
Prokaryotic cytoskeleton proteins
|
||||
MinD / ParA class of bacterial cytoskeletal proteins
|
||||
b1174 (minE)
|
||||
POSITION complement(1224279..1224545)
|
||||
MOTIF Pfam: MinE DUF4364
|
||||
DBLINKS NCBI-ProteinID: NP_415692
|
||||
NCBI-GeneID: 945740
|
||||
Pasteur: minE
|
||||
RegulonDB: ECK120000591
|
||||
ECOCYC: EG10598
|
||||
ASAP: ABE-0003935
|
||||
UniProt: P0A734
|
||||
STRUCTURE PDB: 3R9I 3R9J 1EV0
|
||||
AASEQ 88
|
||||
MALLDFFLSRKKNTANIAKERLQIIVAERRRSDAEPHYLPQLRKDILEVICKYVQIDPEM
|
||||
VTVQLEQKDGDISILELNVTLPEAEELK
|
||||
NTSEQ 267
|
||||
atggcattactcgatttctttctctcgcggaagaaaaacacagccaacattgcaaaagaa
|
||||
cggctgcagattattgttgctgaacgccgtcgcagcgatgcagaaccgcattatctgccg
|
||||
cagttgcgtaaagatattcttgaggtcatttgtaaatacgtacaaattgatcctgagatg
|
||||
gtaaccgtacagcttgagcaaaaagatggcgatatttctattcttgagctgaacgtgacc
|
||||
ttaccggaagcagaagagctgaaataa
|
||||
///
|
||||
ENTRY b1175 CDS T00007
|
||||
NAME minD
|
||||
DEFINITION (RefSeq) inhibitor of FtsZ ring polymerization; chromosome-membrane tethering protein; membrane ATPase of the MinCDEE system
|
||||
ORTHOLOGY K03609 septum site-determining protein MinD
|
||||
ORGANISM eco Escherichia coli K-12 MG1655
|
||||
BRITE Chromosome [BR:eco03036]
|
||||
Prokaryotic Type
|
||||
Chromosome partitioning proteins
|
||||
Inhibitors of FtsZ assembly
|
||||
b1175 (minD)
|
||||
Cytoskeleton proteins [BR:eco04812]
|
||||
Prokaryotic cytoskeleton proteins
|
||||
MinD / ParA class of bacterial cytoskeletal proteins
|
||||
b1175 (minD)
|
||||
POSITION complement(1224549..1225361)
|
||||
MOTIF Pfam: CbiA AAA_31 ParA ArsA_ATPase MipZ CBP_BcsQ Fer4_NifH AAA_24 CLP1_P SRP54 VirC1 DUF87
|
||||
DBLINKS NCBI-ProteinID: NP_415693
|
||||
NCBI-GeneID: 945741
|
||||
Pasteur: minD
|
||||
RegulonDB: ECK120000590
|
||||
ECOCYC: EG10597
|
||||
ASAP: ABE-0003938
|
||||
UniProt: P0AEZ3
|
||||
STRUCTURE PDB: 3Q9L 3R9I 3R9J
|
||||
AASEQ 270
|
||||
MARIIVVTSGKGGVGKTTSSAAIATGLAQKGKKTVVIDFDIGLRNLDLIMGCERRVVYDF
|
||||
VNVIQGDATLNQALIKDKRTENLYILPASQTRDKDALTREGVAKVLDDLKAMDFEFIVCD
|
||||
SPAGIETGALMALYFADEAIITTNPEVSSVRDSDRILGILASKSRRAENGEEPIKEHLLL
|
||||
TRYNPGRVSRGDMLSMEDVLEILRIKLVGVIPEDQSVLRASNQGEPVILDINADAGKAYA
|
||||
DTVERLLGEERPFRFIEEEKKGFLKRLFGG
|
||||
NTSEQ 813
|
||||
atggcacgcattattgttgttacttcgggcaaagggggtgttggtaagacaacctccagc
|
||||
gcggccatcgccactggtttggcccagaagggaaagaaaactgtcgtgatagattttgat
|
||||
atcggcctgcgtaatctcgacctgattatgggttgtgaacgccgggtcgtttacgatttc
|
||||
gtcaacgtcattcagggcgatgcaacgctaaatcaggcgttaattaaagataagcgtact
|
||||
gaaaatctctatattctgccggcatcgcaaacacgcgataaagatgccctcacccgtgaa
|
||||
ggggtcgccaaagttcttgatgatctgaaagcgatggattttgaatttatcgtttgtgac
|
||||
tccccggcagggattgaaaccggtgcgttaatggcactctattttgcagacgaagccatt
|
||||
attaccaccaacccggaagtctcctcagtacgcgactctgaccgtattttaggcattctg
|
||||
gcgtcgaaatcacgccgcgcagaaaatggcgaagagcctattaaagagcacctgctgtta
|
||||
acgcgctataacccaggccgcgtaagcagaggtgacatgctgagcatggaagatgtgctg
|
||||
gagatcctgcgcatcaaactcgtcggcgtgatcccagaggatcaatcagtattgcgcgcc
|
||||
tctaaccagggtgaaccggtcattctcgacattaacgccgatgcgggtaaagcctacgca
|
||||
gataccgtagaacgtctgttgggagaagaacgtcctttccgcttcattgaagaagagaag
|
||||
aaaggcttcctcaaacgcttgttcggaggataa
|
||||
///
|
@ -102,6 +102,7 @@ DOCTEST_MODULES = [
|
||||
"Bio.Graphics.GenomeDiagram._Colors",
|
||||
"Bio.KEGG.Compound",
|
||||
"Bio.KEGG.Enzyme",
|
||||
"Bio.KEGG.Gene",
|
||||
"Bio.KEGG.KGML.KGML_parser",
|
||||
"Bio.NMR.xpktools",
|
||||
"Bio.motifs",
|
||||
|
Reference in New Issue
Block a user