Updating the UniGene parser to be consistent with current UniGene files.

This commit is contained in:
mdehoon
2009-04-24 12:03:45 +00:00
parent 16b50cf438
commit b7084cfd63
6 changed files with 1658 additions and 60 deletions

View File

@ -3,7 +3,7 @@
# license. Please see the LICENSE file that should have been included
# as part of this package.
#
# $Id: __init__.py,v 1.11 2008-07-19 12:43:40 peterc Exp $
# $Id: __init__.py,v 1.12 2009-04-24 12:03:45 mdehoon Exp $
# Sean Davis <sdavis2 at mail dot nih dot gov>
# National Cancer Institute
# National Institutes of Health
@ -29,20 +29,25 @@ Here is an overview of the flat file format that this parser deals with:
I if templated As are found in genomic sequence or
S if a canonical polyA signal is found on
the genomic sequence
GENE_ID Entrez gene identifier associated with at least one sequence in this cluster;
GENE_ID Entrez gene identifier associated with at least one
sequence in this cluster;
to be used instead of LocusLink.
LOCUSLINK LocusLink identifier associated with at least one sequence in this cluster;
LOCUSLINK LocusLink identifier associated with at least one
sequence in this cluster;
deprecated in favor of GENE_ID
CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping on the arabidopsis genome.
HOMOL Homology;
CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping
on the arabidopsis genome.
STS STS
NAME= Name of STS
ACC= GenBank/EMBL/DDBJ accession number of STS [optional field]
DSEG= GDB Dsegment number [optional field]
ACC= GenBank/EMBL/DDBJ accession number of STS
[optional field]
UNISTS= identifier in NCBI's UNISTS database
TXMAP Transcript map interval
MARKER= Marker found on at least one sequence in this cluster
MARKER= Marker found on at least one sequence in this
cluster
RHPANEL= Radiation Hybrid panel used to place marker
PROTSIM Protein Similarity data for the sequence with highest-scoring protein similarity in this cluster
PROTSIM Protein Similarity data for the sequence with
highest-scoring protein similarity in this cluster
ORG= Organism
PROTGI= Sequence GI of protein
PROTID= Sequence ID of protein
@ -52,30 +57,276 @@ Here is an overview of the flat file format that this parser deals with:
SEQUENCE Sequence
ACC= GenBank/EMBL/DDBJ accession number of sequence
NID= Unique nucleotide sequence identifier (gi)
PID= Unique protein sequence identifier (used for non-ESTs)
PID= Unique protein sequence identifier (used for
non-ESTs)
CLONE= Clone identifier (used for ESTs only)
END= End (5'/3') of clone insert read (used for ESTs only)
LID= Library ID; see Hs.lib.info for library name and tissue
MGC= 5' CDS-completeness indicator; if present,
the clone associated with this sequence
is believed CDS-complete. A value greater than 511
is the gi of the CDS-complete mRNA matched by the EST,
otherwise the value is an indicator of the reliability
of the test indicating CDS comleteness;
higher values indicate more reliable CDS-completeness predictions.
SEQTYPE= Description of the nucleotide sequence. Possible values are
mRNA, EST and HTC.
TRACE= The Trace ID of the EST sequence, as provided by NCBI Trace Archive
PERIPHERAL= Indicator that the sequence is a suboptimal
representative of the gene represented by this cluster.
Peripheral sequences are those that are in a cluster
which represents a spliced gene without sharing a
splice junction with any other sequence. In many
cases, they are unspliced transcripts originating
from the gene.
// End of record
END= End (5'/3') of clone insert read (used for
ESTs only)
LID= Library ID; see Hs.lib.info for library name
and tissue
MGC= 5' CDS-completeness indicator; if present, the
clone associated with this sequence is believed
CDS-complete. A value greater than 511 is the gi
of the CDS-complete mRNA matched by the EST,
otherwise the value is an indicator of the
reliability of the test indicating CDS
completeness; higher values indicate more
reliable CDS-completeness predictions.
SEQTYPE= Description of the nucleotide sequence.
Possible values are mRNA, EST and HTC.
TRACE= The Trace ID of the EST sequence, as provided by
NCBI Trace Archive
"""
class SequenceLine:
    """Store the information for one SEQUENCE line from a Unigene file

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ACC=         GenBank/EMBL/DDBJ accession number of sequence
    NID=         Unique nucleotide sequence identifier (gi)
    PID=         Unique protein sequence identifier (used for non-ESTs)
    CLONE=       Clone identifier (used for ESTs only)
    END=         End (5'/3') of clone insert read (used for ESTs only)
    LID=         Library ID; see Hs.lib.info for library name and tissue
    MGC=         5' CDS-completeness indicator; if present,
                 the clone associated with this sequence
                 is believed CDS-complete. A value greater than 511
                 is the gi of the CDS-complete mRNA matched by the EST,
                 otherwise the value is an indicator of the reliability
                 of the test indicating CDS completeness;
                 higher values indicate more reliable CDS-completeness
                 predictions.
    SEQTYPE=     Description of the nucleotide sequence. Possible values
                 are mRNA, EST and HTC.
    TRACE=       The Trace ID of the EST sequence, as provided by NCBI
                 Trace Archive

    Additional attributes set by this class:
    text         The unparsed text the object was built from ('' if none)
    is_image     True if the CLONE value starts with 'IMAGE'
    image        The CLONE value with its first six characters removed
                 (i.e. without the leading 'IMAGE:'), '' otherwise
    """
    def __init__(self, text=None):
        self.acc = ''
        self.nid = ''
        self.lid = ''
        self.pid = ''
        self.clone = ''
        self.image = ''
        self.is_image = False
        self.end = ''
        self.mgc = ''
        self.seqtype = ''
        self.trace = ''
        # Always define text so that repr() also works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Fill in the attributes from the 'KEY=value; KEY=value' text."""
        for part in text.split('; '):
            # Split on the first "=" only: a value may itself contain "=".
            key, val = part.split("=", 1)
            if key == 'CLONE' and val[:5] == 'IMAGE':
                self.is_image = True
                self.image = val[6:]
            setattr(self, key.lower(), val)

    def __repr__(self):
        return self.text
class ProtsimLine:
    """Store the information for one PROTSIM line from a Unigene file

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ORG=         Organism
    PROTGI=      Sequence GI of protein
    PROTID=      Sequence ID of protein
    PCT=         Percent alignment
    ALN=         length of aligned region (aa)

    The unparsed text is kept in the text attribute ('' if none was given).
    All values are stored as strings; no numeric conversion is done.
    """
    def __init__(self, text=None):
        self.org = ''
        self.protgi = ''
        self.protid = ''
        self.pct = ''
        self.aln = ''
        # Always define text so that repr() also works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Fill in the attributes from the 'KEY=value; KEY=value' text."""
        for part in text.split('; '):
            # Split on the first "=" only: a value may itself contain "=".
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        return self.text
class STSLine:
    """Store the information for one STS line from a Unigene file

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
    UNISTS=      identifier in NCBI's UNISTS database

    The unparsed text is kept in the text attribute ('' if none was given).
    """
    def __init__(self, text=None):
        self.acc = ''
        self.unists = ''
        # Always define text so that repr() also works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Fill in the attributes from the whitespace-separated KEY=value text."""
        # split() without an argument collapses runs of whitespace, so padded
        # or tab-separated fields do not produce empty, unparsable parts.
        for part in text.split():
            # Split on the first "=" only: a value may itself contain "=".
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        return self.text
class Record:
    """Container for the contents of one UniGene record.

    A freshly created Record carries these attributes:

    ID            ''   ID line
    species       ''   Hs, Bt, etc.
    title         ''   TITLE line
    symbol        ''   GENE line
    cytoband      ''   CYTOBAND line
    express       []   EXPRESS line entries, a list of strings
    restr_expr    ''   RESTR_EXPR line
    gnm_terminus  ''   GNM_TERMINUS line
    gene_id       ''   GENE_ID line
    locuslink     ''   LOCUSLINK line
    homol         ''   HOMOL line
    chromosome    ''   CHROMOSOME line
    protsim       []   PROTSIM entries (ProtsimLine objects)
    sequence      []   SEQUENCE entries (SequenceLine objects)
    sts           []   STS entries (STSLine objects)
    txmap         []   TXMAP entries
    """
    def __init__(self):
        # (attribute, factory) pairs; calling the factory gives every
        # instance its own fresh empty string or empty list.
        layout = (
            ("ID", str),
            ("species", str),
            ("title", str),
            ("symbol", str),
            ("cytoband", str),
            ("express", list),
            ("restr_expr", str),
            ("gnm_terminus", str),
            ("gene_id", str),
            ("locuslink", str),
            ("homol", str),
            ("chromosome", str),
            ("protsim", list),
            ("sequence", list),
            ("sts", list),
            ("txmap", list),
        )
        for attribute, factory in layout:
            setattr(self, attribute, factory())

    def __repr__(self):
        # First line: class name, ID and gene symbol; second line: the title.
        heading = "<%s> %s %s" % (self.__class__.__name__, self.ID, self.symbol)
        return "%s\n%s" % (heading, self.title)
def parse(handle):
    """Iterate over the UniGene records available from handle.

    Generator: repeatedly calls _read(handle) and yields each record it
    returns, stopping as soon as _read returns a false value (no more
    records).
    """
    entry = _read(handle)
    while entry:
        yield entry
        entry = _read(handle)
def read(handle):
    """Read exactly one UniGene record from handle and return it.

    Raises ValueError if no record is found, or if any data remains in
    the handle after the first record.
    """
    record = _read(handle)
    if not record:
        raise ValueError("No UniGene record found")
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one UniGene record found")
    return record
# Everything below is private
def _read(handle):
UG_INDENT = 12
record = None
for line in handle:
tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip()
line = line.rstrip()
if tag=="ID":
record = Record()
record.ID = value
record.species = record.ID.split('.')[0]
elif tag=="TITLE":
record.title = value
elif tag=="GENE":
record.symbol = value
elif tag=="GENE_ID":
record.gene_id = value
elif tag=="LOCUSLINK":
record.locuslink = value
elif tag=="HOMOL":
if value=="YES":
record.homol = True
elif value=="NO":
record.homol = True
else:
raise ValueError, "Cannot parse HOMOL line %s" % line
elif tag=="EXPRESS":
record.express = [word.strip() for word in value.split("|")]
elif tag=="RESTR_EXPR":
record.restr_expr = [word.strip() for word in value.split("|")]
elif tag=="CHROMOSOME":
record.chromosome = value
elif tag=="CYTOBAND":
record.cytoband = value
elif tag=="PROTSIM":
protsim = ProtsimLine(value)
record.protsim.append(protsim)
elif tag=="SCOUNT":
scount = int(value)
elif tag=="SEQUENCE":
sequence = SequenceLine(value)
record.sequence.append(sequence)
elif tag=="STS":
sts = STSLine(value)
record.sts.append(sts)
elif tag=='//':
if len(record.sequence)!=scount:
raise ValueError, "The number of sequences specified in the record (%d) does not agree with the number of sequences found (%d)" % (scount, len(record.sequence))
return record
else:
raise ValueError, "Unknown tag %s" % tag
if record:
raise ValueError("Unexpected end of stream.")
# Everything below is considered obsolete
from Bio.ParserSupport import *
import re

View File

@ -0,0 +1,149 @@
ID Eca.1
TITLE Ribosomal protein L3
GENE RPL3
GENE_ID 100070291
LOCUSLINK 100070291
HOMOL YES
EXPRESS blood| cartilage| trophoblast|adult
RESTR_EXPR blood|adult
PROTSIM ORG=10090; PROTGI=149270989; PROTID=XP_001477314.1; PCT=99.26; ALN=401
PROTSIM ORG=5207; PROTGI=134117057; PROTID=XP_772755.1; PCT=70.91; ALN=384
PROTSIM ORG=44689; PROTGI=66802278; PROTID=XP_629921.1; PCT=67.76; ALN=396
PROTSIM ORG=9606; PROTGI=4506649; PROTID=NP_000958.1; PCT=99.50; ALN=401
PROTSIM ORG=7719; PROTGI=198435984; PROTID=XP_002132034.1; PCT=80.15; ALN=402
PROTSIM ORG=13616; PROTGI=126339540; PROTID=XP_001366844.1; PCT=97.05; ALN=405
PROTSIM ORG=9615; PROTGI=73969069; PROTID=XP_531732.2; PCT=99.50; ALN=401
PROTSIM ORG=3218; PROTGI=168040500; PROTID=XP_001772732.1; PCT=73.01; ALN=388
PROTSIM ORG=9913; PROTGI=27807287; PROTID=NP_777140.1; PCT=99.26; ALN=401
PROTSIM ORG=8355; PROTGI=148228673; PROTID=NP_001080341.1; PCT=94.04; ALN=401
PROTSIM ORG=9258; PROTGI=149585070; PROTID=XP_001514408.1; PCT=97.29; ALN=367
PROTSIM ORG=7955; PROTGI=48597014; PROTID=NP_001001590.1; PCT=92.31; ALN=401
PROTSIM ORG=7227; PROTGI=17737907; PROTID=NP_524316.1; PCT=78.91; ALN=402
PROTSIM ORG=28985; PROTGI=50311593; PROTID=XP_455822.1; PCT=72.28; ALN=385
PROTSIM ORG=9796; PROTGI=149743311; PROTID=XP_001501957.1; PCT=100.00; ALN=401
PROTSIM ORG=4896; PROTGI=19115692; PROTID=NP_594780.1; PCT=74.61; ALN=385
PROTSIM ORG=4932; PROTGI=6324637; PROTID=NP_014706.1; PCT=72.02; ALN=385
PROTSIM ORG=8364; PROTGI=187607061; PROTID=NP_001120075.1; PCT=94.54; ALN=401
PROTSIM ORG=5911; PROTGI=118389862; PROTID=XP_001027976.1; PCT=64.60; ALN=386
PROTSIM ORG=4530; PROTGI=115487526; PROTID=NP_001066250.1; PCT=73.20; ALN=387
PROTSIM ORG=10116; PROTGI=38454246; PROTID=NP_942048.1; PCT=99.01; ALN=401
PROTSIM ORG=5833; PROTGI=124802670; PROTID=XP_001347556.1; PCT=69.45; ALN=382
PROTSIM ORG=148305; PROTGI=145608344; PROTID=XP_360650.2; PCT=71.28; ALN=389
PROTSIM ORG=7668; PROTGI=115725235; PROTID=XP_791350.2; PCT=81.50; ALN=399
PROTSIM ORG=3702; PROTGI=15218306; PROTID=NP_175009.1; PCT=71.47; ALN=387
PROTSIM ORG=45351; PROTGI=156359547; PROTID=XP_001624829.1; PCT=80.10; ALN=396
PROTSIM ORG=7091; PROTGI=112982798; PROTID=NP_001037126.1; PCT=80.65; ALN=402
PROTSIM ORG=33169; PROTGI=45188079; PROTID=NP_984302.1; PCT=72.35; ALN=386
PROTSIM ORG=7460; PROTGI=66566113; PROTID=XP_624821.1; PCT=80.89; ALN=402
PROTSIM ORG=5141; PROTGI=164424622; PROTID=XP_963317.2; PCT=71.54; ALN=389
PROTSIM ORG=6239; PROTGI=71984538; PROTID=NP_001021254.1; PCT=75.63; ALN=397
PROTSIM ORG=3055; PROTGI=159489312; PROTID=XP_001702641.1; PCT=72.99; ALN=384
PROTSIM ORG=7070; PROTGI=189240524; PROTID=XP_971875.2; PCT=76.67; ALN=402
PROTSIM ORG=7165; PROTGI=158291770; PROTID=XP_313303.4; PCT=76.56; ALN=400
PROTSIM ORG=9544; PROTGI=109094341; PROTID=XP_001095608.1; PCT=99.50; ALN=401
PROTSIM ORG=7176; PROTGI=170041842; PROTID=XP_001848658.1; PCT=74.75; ALN=403
PROTSIM ORG=5888; PROTGI=145475213; PROTID=XP_001423629.1; PCT=66.67; ALN=377
PROTSIM ORG=9031; PROTGI=57525400; PROTID=NP_001006241.1; PCT=97.27; ALN=401
PROTSIM ORG=7029; PROTGI=193580256; PROTID=XP_001951042.1; PCT=79.40; ALN=402
PROTSIM ORG=9598; PROTGI=114608746; PROTID=XP_518669.2; PCT=93.73; ALN=381
SCOUNT 69
SEQUENCE ACC=XM_001501907.1; NID=g149743310; PID=g149743311; SEQTYPE=Model
SEQUENCE ACC=CD469355.1; NID=g31390623; CLONE=LeukoS2_3_H08_A024; END=5'; LID=13776; SEQTYPE=EST; TRACE=891191391
SEQUENCE ACC=CD535273.1; NID=g31577688; CLONE=LeukoN5_1_E01_A027; END=5'; LID=13843; SEQTYPE=EST; TRACE=891188893
SEQUENCE ACC=CD466249.1; NID=g31387517; CLONE=LeukoN2_3_E11_A024; END=3'; LID=13773; SEQTYPE=EST; TRACE=891186094
SEQUENCE ACC=CD465822.1; NID=g31387090; CLONE=LeukoN1_8_C11_A023; END=3'; LID=13772; SEQTYPE=EST; TRACE=891184882
SEQUENCE ACC=CD466309.1; NID=g31387577; CLONE=LeukoN2_3_E11_A024; END=5'; LID=13773; SEQTYPE=EST; TRACE=891185997
SEQUENCE ACC=CD465893.1; NID=g31387161; CLONE=LeukoN1_8_C11_A023; END=5'; LID=13772; SEQTYPE=EST; TRACE=891185010
SEQUENCE ACC=CD471087.1; NID=g31392355; CLONE=LeukoS5_4_H03_A027; END=5'; LID=13778; SEQTYPE=EST; TRACE=891193766
SEQUENCE ACC=CD472103.1; NID=g31393371; CLONE=LeukoS6_1_C10_A028; END=3'; LID=13779; SEQTYPE=EST; TRACE=891194618
SEQUENCE ACC=CD471067.1; NID=g31392335; CLONE=LeukoS5_4_H03_A027; END=3'; LID=13778; SEQTYPE=EST; TRACE=891194175
SEQUENCE ACC=CD470984.1; NID=g31392252; CLONE=LeukoS5_3_A04_A027; END=5'; LID=13778; SEQTYPE=EST; TRACE=891194114
SEQUENCE ACC=CD472154.1; NID=g31393422; CLONE=LeukoS6_1_H10_A028; END=3'; LID=13779; SEQTYPE=EST; TRACE=891195062
SEQUENCE ACC=CD467586.1; NID=g31388854; CLONE=LeukoS1_5_G11_A023; END=5'; LID=13774; SEQTYPE=EST; TRACE=891190912
SEQUENCE ACC=CD465578.1; NID=g31386846; CLONE=LeukoN1_5_F07_A023; END=5'; LID=13772; SEQTYPE=EST; TRACE=891185096
SEQUENCE ACC=CD465858.1; NID=g31387126; CLONE=LeukoN1_8_H04_A023; END=3'; LID=13772; SEQTYPE=EST; TRACE=891185129
SEQUENCE ACC=CD471796.1; NID=g31393064; CLONE=LeukoS6_2_D05_A028; END=3'; LID=13779; SEQTYPE=EST; TRACE=891194917
SEQUENCE ACC=CD471724.1; NID=g31392992; CLONE=LeukoS6_4_F09_A028; END=5'; LID=13779; SEQTYPE=EST; TRACE=891195058
SEQUENCE ACC=CD471610.1; NID=g31392878; CLONE=LeukoS6_4_F09_A028; END=3'; LID=13779; SEQTYPE=EST; TRACE=891194846
SEQUENCE ACC=CD535233.1; NID=g31577648; CLONE=LeukoN5_1_B08_A027; END=5'; LID=13843; SEQTYPE=EST; TRACE=891188752
SEQUENCE ACC=CD464482.1; NID=g31385750; CLONE=LeukoN4_4_D01_A026; END=3'; LID=13771; SEQTYPE=EST; TRACE=891187846
SEQUENCE ACC=CD468770.1; NID=g31390038; CLONE=LeukoS3_5_D03_A025; END=5'; LID=13775; SEQTYPE=EST; TRACE=891192280
SEQUENCE ACC=CD470966.1; NID=g31392234; CLONE=LeukoS5_3_F05_A027; END=5'; LID=13778; SEQTYPE=EST; TRACE=891193981
SEQUENCE ACC=CD536398.1; NID=g31578813; CLONE=LeukoN6_7_C09_A028; END=5'; LID=13844; SEQTYPE=EST; TRACE=891184341
SEQUENCE ACC=CD535162.1; NID=g31577577; CLONE=LeukoN5_1_E01_A027; END=3'; LID=13843; SEQTYPE=EST; TRACE=891188481
SEQUENCE ACC=CD536377.1; NID=g31578792; CLONE=LeukoN6_7_C09_A028; END=3'; LID=13844; SEQTYPE=EST; TRACE=891184713
SEQUENCE ACC=CX603705.1; NID=g57720427; CLONE=CT02036A2C09; LID=16895; SEQTYPE=EST
SEQUENCE ACC=CD470556.1; NID=g31391824; CLONE=LeukoS4_5_C09_A026; END=5'; LID=13777; SEQTYPE=EST; TRACE=891193234
SEQUENCE ACC=CD466384.1; NID=g31387652; CLONE=LeukoN2_3_A06_A024; END=5'; LID=13773; SEQTYPE=EST; TRACE=891186500
SEQUENCE ACC=CD472213.1; NID=g31393481; CLONE=LeukoS6_1_C10_A028; END=5'; LID=13779; SEQTYPE=EST; TRACE=891194841
SEQUENCE ACC=DN510913.1; NID=g60721103; CLONE=HL02021A2C09; LID=17147; SEQTYPE=EST
SEQUENCE ACC=CD472329.1; NID=g31393597; CLONE=LeukoS6_5_H05_A028; END=3'; LID=13779; SEQTYPE=EST; TRACE=891195397
SEQUENCE ACC=CD536804.1; NID=g31579219; CLONE=LeukoN6_6_F12_A028; END=3'; LID=13844; SEQTYPE=EST; TRACE=891184838
SEQUENCE ACC=CD466071.1; NID=g31387339; CLONE=LeukoN2_1_E06_A024; END=5'; LID=13773; SEQTYPE=EST; TRACE=891186571
SEQUENCE ACC=CD471909.1; NID=g31393177; CLONE=LeukoS6_2_D05_A028; END=5'; LID=13779; SEQTYPE=EST; TRACE=891195132
SEQUENCE ACC=CD465985.1; NID=g31387253; CLONE=LeukoN2_1_E06_A024; END=3'; LID=13773; SEQTYPE=EST; TRACE=891186530
SEQUENCE ACC=CD471244.1; NID=g31392512; CLONE=LeukoS5_1_F09_A027; END=5'; LID=13778; SEQTYPE=EST; TRACE=891193761
SEQUENCE ACC=DN508615.1; NID=g60718805; CLONE=HL02013A1F04; LID=17147; SEQTYPE=EST
SEQUENCE ACC=CD465146.1; NID=g31386414; CLONE=LeukoN1_2_E04_A023; END=3'; LID=13772; SEQTYPE=EST; TRACE=891185503
SEQUENCE ACC=CD465620.1; NID=g31386888; CLONE=LeukoN1_6_E09_A023; END=3'; LID=13772; SEQTYPE=EST; TRACE=891184965
SEQUENCE ACC=CD472042.1; NID=g31393310; CLONE=LeukoS6_3_F02_A028; END=5'; LID=13779; SEQTYPE=EST; TRACE=891194801
SEQUENCE ACC=CD471148.1; NID=g31392416; CLONE=LeukoS5_4_F11_A027; END=5'; LID=13778; SEQTYPE=EST; TRACE=891194177
SEQUENCE ACC=CD466241.1; NID=g31387509; CLONE=LeukoN2_3_A06_A024; END=3'; LID=13773; SEQTYPE=EST; TRACE=891186007
SEQUENCE ACC=CD472182.1; NID=g31393450; CLONE=LeukoS6_1_H10_A028; END=5'; LID=13779; SEQTYPE=EST; TRACE=891194593
SEQUENCE ACC=CD470462.1; NID=g31391730; CLONE=LeukoS4_5_C12_A026; END=3'; LID=13777; SEQTYPE=EST; TRACE=891193322
SEQUENCE ACC=CX596662.1; NID=g57706330; CLONE=CT020015A10A06; LID=16895; SEQTYPE=EST
SEQUENCE ACC=CD536825.1; NID=g31579240; CLONE=LeukoN6_6_F12_A028; END=5'; LID=13844; SEQTYPE=EST; TRACE=891184460
SEQUENCE ACC=CD471373.1; NID=g31392641; CLONE=LeukoS5_5_H05_A027; END=3'; LID=13778; SEQTYPE=EST; TRACE=891194564
SEQUENCE ACC=CD470875.1; NID=g31392143; CLONE=LeukoS5_3_F05_A027; END=3'; LID=13778; SEQTYPE=EST; TRACE=891193793
SEQUENCE ACC=CD470392.1; NID=g31391660; CLONE=LeukoS4_4_G05_A026; END=3'; LID=13777; SEQTYPE=EST; TRACE=891193723
SEQUENCE ACC=DN509023.1; NID=g60719213; CLONE=HL02014B1D10; LID=17147; SEQTYPE=EST
SEQUENCE ACC=CD471226.1; NID=g31392494; CLONE=LeukoS5_1_F09_A027; END=3'; LID=13778; SEQTYPE=EST; TRACE=891194172
SEQUENCE ACC=CD470547.1; NID=g31391815; CLONE=LeukoS4_5_C12_A026; END=5'; LID=13777; SEQTYPE=EST; TRACE=891193220
SEQUENCE ACC=CX593399.1; NID=g57699803; CLONE=CT020005A10B06; LID=16895; SEQTYPE=EST
SEQUENCE ACC=CD465261.1; NID=g31386529; CLONE=LeukoN1_2_E04_A023; END=5'; LID=13772; SEQTYPE=EST; TRACE=891185735
SEQUENCE ACC=CD470468.1; NID=g31391736; CLONE=LeukoS4_5_C09_A026; END=3'; LID=13777; SEQTYPE=EST; TRACE=891193330
SEQUENCE ACC=CX594874.1; NID=g57702753; CLONE=CT020009B10E10; LID=16895; SEQTYPE=EST
SEQUENCE ACC=CD464520.1; NID=g31385788; CLONE=LeukoN4_4_D01_A026; END=5'; LID=13771; SEQTYPE=EST; TRACE=891188021
SEQUENCE ACC=AW260814.1; NID=g6636524; END=5'; LID=2777; SEQTYPE=EST
SEQUENCE ACC=CD465691.1; NID=g31386959; CLONE=LeukoN1_6_E09_A023; END=5'; LID=13772; SEQTYPE=EST; TRACE=891185089
SEQUENCE ACC=CX595434.1; NID=g57703875; CLONE=CT020011A20F11; LID=16895; SEQTYPE=EST
SEQUENCE ACC=CD464759.1; NID=g31386027; CLONE=LeukoN4_6_A09_A026; END=5'; LID=13771; SEQTYPE=EST; TRACE=891188174
SEQUENCE ACC=CD469441.1; NID=g31390709; CLONE=LeukoS2_3_F03_A024; END=5'; LID=13776; SEQTYPE=EST; TRACE=946412740
SEQUENCE ACC=CD471416.1; NID=g31392684; CLONE=LeukoS5_5_H05_A027; END=5'; LID=13778; SEQTYPE=EST; TRACE=891194460
SEQUENCE ACC=CD536439.1; NID=g31578854; CLONE=LeukoN6_7_B04_A028; END=5'; LID=13844; SEQTYPE=EST; TRACE=891184662
SEQUENCE ACC=CX605282.1; NID=g57723597; CLONE=CT02041A1A07; LID=16895; SEQTYPE=EST
SEQUENCE ACC=CD464302.1; NID=g31385570; CLONE=LeukoN4_3_B01_A026; END=3'; LID=13771; SEQTYPE=EST; TRACE=891187636
SEQUENCE ACC=CD469295.1; NID=g31390563; CLONE=LeukoS2_3_F03_A024; END=3'; LID=13776; SEQTYPE=EST; TRACE=891191446
SEQUENCE ACC=CD470883.1; NID=g31392151; CLONE=LeukoS5_3_G08_A027; END=3'; LID=13778; SEQTYPE=EST; TRACE=891193865
SEQUENCE ACC=CD468749.1; NID=g31390017; CLONE=LeukoS3_5_E02_A025; END=5'; LID=13775; SEQTYPE=EST; TRACE=891192160
//
ID Eca.2425
TITLE Immunoglobulin-like transcript 11 protein
GENE ILT11B
CYTOBAND 10p12
GENE_ID 100034238
LOCUSLINK 100034238
HOMOL YES
EXPRESS blood|adult
PROTSIM ORG=10090; PROTGI=156766061; PROTID=NP_001074708.2; PCT=50.00; ALN=281
PROTSIM ORG=9606; PROTGI=32490553; PROTID=NP_067073.1; PCT=62.42; ALN=296
PROTSIM ORG=13616; PROTGI=126346255; PROTID=XP_001375479.1; PCT=40.43; ALN=281
PROTSIM ORG=9615; PROTGI=73947540; PROTID=XP_854148.1; PCT=42.11; ALN=303
PROTSIM ORG=9913; PROTGI=194674905; PROTID=XP_001788837.1; PCT=61.11; ALN=233
PROTSIM ORG=9258; PROTGI=149631692; PROTID=XP_001516158.1; PCT=35.65; ALN=229
PROTSIM ORG=9796; PROTGI=146262003; PROTID=NP_001075996.1; PCT=100.00; ALN=279
PROTSIM ORG=10116; PROTGI=109461031; PROTID=XP_001065532.1; PCT=52.77; ALN=269
PROTSIM ORG=9544; PROTGI=100818611; PROTID=NP_001035767.1; PCT=57.35; ALN=277
PROTSIM ORG=9823; PROTGI=178057314; PROTID=NP_001116615.1; PCT=43.01; ALN=278
PROTSIM ORG=9598; PROTGI=58801536; PROTID=NP_001009045.1; PCT=54.51; ALN=276
SCOUNT 9
SEQUENCE ACC=AB120409.1; NID=g45597282; PID=g45597283; SEQTYPE=mRNA
SEQUENCE ACC=NM_001082527.1; NID=g146262002; PID=g146262003; SEQTYPE=mRNA
SEQUENCE ACC=AB120410.1; NID=g45597284; PID=g45597285; SEQTYPE=mRNA
SEQUENCE ACC=AB120411.1; NID=g45597286; PID=g45597287; SEQTYPE=mRNA
SEQUENCE ACC=CD467202.1; NID=g31388470; CLONE=LeukoS1_2_D11_A023; END=5'; LID=13774; SEQTYPE=EST; TRACE=891190473
SEQUENCE ACC=CD535974.1; NID=g31578389; CLONE=LeukoN6_2_F07_A028; END=3'; LID=13844; SEQTYPE=EST; TRACE=891184067
SEQUENCE ACC=CD469544.1; NID=g31390812; CLONE=LeukoS2_4_D05_A024; END=5'; LID=13776; SEQTYPE=EST; TRACE=891191700
SEQUENCE ACC=CD528742.1; NID=g31567364; CLONE=LeukoN3_7_F08_A025; END=3'; LID=13842; SEQTYPE=EST; TRACE=891187302
SEQUENCE ACC=CD469487.1; NID=g31390755; CLONE=LeukoS2_4_D05_A024; END=3'; LID=13776; SEQTYPE=EST; TRACE=891191801
//

70
Tests/UniGene/Hs.2.data Normal file
View File

@ -0,0 +1,70 @@
ID Hs.2
TITLE N-acetyltransferase 2 (arylamine N-acetyltransferase)
GENE NAT2
CYTOBAND 8p22
GENE_ID 10
LOCUSLINK 10
HOMOL YES
EXPRESS bone| connective tissue| intestine| liver| liver tumor| normal| soft tissue/muscle tissue tumor| adult
RESTR_EXPR adult
CHROMOSOME 8
STS ACC=PMC310725P3 UNISTS=272646
STS ACC=WIAF-2120 UNISTS=44576
STS ACC=G59899 UNISTS=137181
STS ACC=G06461 UNISTS=17088
STS ACC=GDB:310612 UNISTS=156422
STS ACC=GDB:310613 UNISTS=156423
STS ACC=GDB:187676 UNISTS=155563
PROTSIM ORG=10090; PROTGI=6754794; PROTID=NP_035004.1; PCT=76.55; ALN=288
PROTSIM ORG=9796; PROTGI=149742490; PROTID=XP_001487907.1; PCT=79.66; ALN=288
PROTSIM ORG=9986; PROTGI=126722851; PROTID=NP_001075655.1; PCT=76.90; ALN=288
PROTSIM ORG=9544; PROTGI=113461974; PROTID=NP_001038201.1; PCT=93.10; ALN=288
PROTSIM ORG=9606; PROTGI=116295260; PROTID=NP_000006.2; PCT=100.00; ALN=288
PROTSIM ORG=7719; PROTGI=198423589; PROTID=XP_002126320.1; PCT=29.46; ALN=240
PROTSIM ORG=10116; PROTGI=16758720; PROTID=NP_446306.1; PCT=75.52; ALN=288
PROTSIM ORG=13616; PROTGI=126303190; PROTID=XP_001371814.1; PCT=63.67; ALN=287
PROTSIM ORG=9031; PROTGI=45384408; PROTID=NP_990671.1; PCT=57.93; ALN=288
PROTSIM ORG=9913; PROTGI=115497526; PROTID=NP_001069040.1; PCT=80.34; ALN=288
PROTSIM ORG=9258; PROTGI=149640224; PROTID=XP_001508302.1; PCT=62.37; ALN=278
PROTSIM ORG=7955; PROTGI=125821897; PROTID=XP_001334322.1; PCT=43.21; ALN=279
PROTSIM ORG=9598; PROTGI=114619004; PROTID=XP_519631.2; PCT=98.28; ALN=288
SCOUNT 38
SEQUENCE ACC=BC067218.1; NID=g45501306; PID=g45501307; SEQTYPE=mRNA
SEQUENCE ACC=NM_000015.2; NID=g116295259; PID=g116295260; SEQTYPE=mRNA
SEQUENCE ACC=D90042.1; NID=g219415; PID=g219416; SEQTYPE=mRNA
SEQUENCE ACC=D90040.1; NID=g219411; PID=g219412; SEQTYPE=mRNA
SEQUENCE ACC=BC015878.1; NID=g16198419; PID=g16198420; SEQTYPE=mRNA
SEQUENCE ACC=CR407631.1; NID=g47115198; PID=g47115199; SEQTYPE=mRNA
SEQUENCE ACC=BG569293.1; NID=g13576946; CLONE=IMAGE:4722596; END=5'; LID=6989; SEQTYPE=EST; TRACE=44157214
SEQUENCE ACC=AI792606.1; NID=g5340322; CLONE=IMAGE:1870937; END=5'; LID=1079; SEQTYPE=EST
SEQUENCE ACC=BG568794.1; NID=g13576447; CLONE=IMAGE:4716636; END=5'; LID=6989; SEQTYPE=EST; TRACE=44156817
SEQUENCE ACC=BG533459.1; NID=g13524999; CLONE=IMAGE:4072143; END=5'; LID=6989; SEQTYPE=EST; TRACE=44404609
SEQUENCE ACC=BG568400.1; NID=g13576053; CLONE=IMAGE:4716802; END=5'; LID=6989; SEQTYPE=EST; TRACE=44156561
SEQUENCE ACC=BG618195.1; NID=g13669566; CLONE=IMAGE:4767316; END=5'; LID=6989; SEQTYPE=EST; TRACE=45338366
SEQUENCE ACC=BG563731.1; NID=g13571383; CLONE=IMAGE:4712210; END=5'; LID=6989; SEQTYPE=EST; TRACE=44153506
SEQUENCE ACC=AI733799.1; NID=g5054912; CLONE=IMAGE:1870937; END=3'; LID=1079; SEQTYPE=EST
SEQUENCE ACC=BG569272.1; NID=g13576925; CLONE=IMAGE:4722638; END=5'; LID=6989; SEQTYPE=EST; TRACE=44157191
SEQUENCE ACC=AJ581147.1; NID=g73759744; PID=g73759745; SEQTYPE=mRNA
SEQUENCE ACC=BU624903.1; NID=g23291118; CLONE=UI-H-FG1-bgl-g-02-0-UI; END=3'; LID=11914; SEQTYPE=EST; TRACE=159705553
SEQUENCE ACC=BG617259.1; NID=g13668630; CLONE=IMAGE:4734378; END=5'; LID=6989; SEQTYPE=EST; TRACE=44229423
SEQUENCE ACC=CV029049.1; NID=g51487181; END=5'; LID=16264; SEQTYPE=EST
SEQUENCE ACC=BP264356.1; NID=g52179587; CLONE=HSI15615; END=5'; LID=16400; SEQTYPE=EST
SEQUENCE ACC=BP262043.1; NID=g52177274; CLONE=HSI05750; END=5'; LID=16400; SEQTYPE=EST
SEQUENCE ACC=BP262787.1; NID=g52178018; CLONE=HSI08034; END=5'; LID=16400; SEQTYPE=EST
SEQUENCE ACC=CB161982.1; NID=g28148108; CLONE=L17N670205n1-15-F12; END=5'; LID=12542; SEQTYPE=EST
SEQUENCE ACC=BX095770.1; NID=g27827877; CLONE=IMAGp998I184581_,_IMAGE:1870937; LID=1079; SEQTYPE=EST
SEQUENCE ACC=AI262683.1; NID=g3870886; CLONE=IMAGE:1870937; END=3'; LID=1079; SEQTYPE=EST
SEQUENCE ACC=AI460128.1; NID=g4313009; CLONE=IMAGE:2151449; END=3'; LID=1556; SEQTYPE=EST
SEQUENCE ACC=CB161860.1; NID=g28147986; CLONE=L17N670205n1-41-A04; END=5'; LID=12542; SEQTYPE=EST
SEQUENCE ACC=AJ581145.1; NID=g73759740; PID=g73759741; SEQTYPE=mRNA
SEQUENCE ACC=EG327405.1; NID=g116004924; CLONE=MGC12part.1.3.L1.1.D01.; LID=17261; SEQTYPE=EST
SEQUENCE ACC=AJ581144.1; NID=g73759738; PID=g73759739; SEQTYPE=mRNA
SEQUENCE ACC=EG327340.1; NID=g116004859; CLONE=MGC12part.1.2.L1.1.D01.; LID=17261; SEQTYPE=EST
SEQUENCE ACC=AV658623.1; NID=g9879637; CLONE=GLCFOD10; END=3'; LID=5601; SEQTYPE=EST
SEQUENCE ACC=AV658656.1; NID=g9879670; CLONE=GLCFOG07; END=3'; LID=5601; SEQTYPE=EST
SEQUENCE ACC=AV684197.2; NID=g55943471; CLONE=GKCFZH06; END=5'; LID=6533; SEQTYPE=EST
SEQUENCE ACC=AJ581146.1; NID=g73759742; PID=g73759743; SEQTYPE=mRNA
SEQUENCE ACC=EG327618.1; NID=g116005137; CLONE=MGC11part.2.3.L1.1.E06.; LID=17261; SEQTYPE=EST
SEQUENCE ACC=BG204539.1; NID=g13726226; LID=8655; SEQTYPE=EST
SEQUENCE ACC=AU099534.1; NID=g13550663; CLONE=HSI08034; END=5'; LID=8800; SEQTYPE=EST
//

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,34 @@
from Bio import UniGene
#Start of the UniGene file for Monodelphis domestica downloaded from:
#ftp://ftp.ncbi.nih.gov/repository/UniGene/Monodelphis_domestica
handle = open("UniGene/Mdm_partial.data")
ugparser = UniGene.Iterator(handle, UniGene.RecordParser())
for record in ugparser:
assert isinstance(record.ID, str)
assert isinstance(record.title, str)
assert isinstance(record.species, str)
assert isinstance(record.express, list)
assert isinstance(record.sequence, list)
print record.ID
print "Title: '%s'" % record.title
print "Expressed:", record.express
print "Chromosome:", record.chromosome
if record.sequence :
print "Sequences:"
for s in record.sequence :
assert isinstance(s, UniGene.UnigeneSequenceRecord)
print s
else :
print "No sequences"
assert record.species == "Mdm"
#Should be no PROTSIM lines in this file!
assert isinstance(record.protsim, list)
assert len(record.protsim) == 0
print
print "Done"
handle.close()