mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 21:53:47 +08:00
Updating the UniGene parser to be consistent with current UniGene files.
This commit is contained in:
@ -3,7 +3,7 @@
|
||||
# license. Please see the LICENSE file that should have been included
|
||||
# as part of this package.
|
||||
#
|
||||
# $Id: __init__.py,v 1.11 2008-07-19 12:43:40 peterc Exp $
|
||||
# $Id: __init__.py,v 1.12 2009-04-24 12:03:45 mdehoon Exp $
|
||||
# Sean Davis <sdavis2 at mail dot nih dot gov>
|
||||
# National Cancer Institute
|
||||
# National Institutes of Health
|
||||
@ -29,20 +29,25 @@ Here is an overview of the flat file format that this parser deals with:
|
||||
I if templated As are found in genomic sequence or
|
||||
S if a canonical polyA signal is found on
|
||||
the genomic sequence
|
||||
GENE_ID Entrez gene identifier associated with at least one sequence in this cluster;
|
||||
GENE_ID Entrez gene identifier associated with at least one
|
||||
sequence in this cluster;
|
||||
to be used instead of LocusLink.
|
||||
LOCUSLINK LocusLink identifier associated with at least one sequence in this cluster;
|
||||
LOCUSLINK LocusLink identifier associated with at least one
|
||||
sequence in this cluster;
|
||||
deprecated in favor of GENE_ID
|
||||
CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping on the arabidopsis genome.
|
||||
HOMOL Homology;
|
||||
CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping
|
||||
on the arabidopsis genome.
|
||||
STS STS
|
||||
NAME= Name of STS
|
||||
ACC= GenBank/EMBL/DDBJ accession number of STS [optional field]
|
||||
DSEG= GDB Dsegment number [optional field]
|
||||
ACC= GenBank/EMBL/DDBJ accession number of STS
|
||||
[optional field]
|
||||
UNISTS= identifier in NCBI's UNISTS database
|
||||
TXMAP Transcript map interval
|
||||
MARKER= Marker found on at least one sequence in this cluster
|
||||
MARKER= Marker found on at least one sequence in this
|
||||
cluster
|
||||
RHPANEL= Radiation Hybrid panel used to place marker
|
||||
PROTSIM Protein Similarity data for the sequence with highest-scoring protein similarity in this cluster
|
||||
PROTSIM Protein Similarity data for the sequence with
|
||||
highest-scoring protein similarity in this cluster
|
||||
ORG= Organism
|
||||
PROTGI= Sequence GI of protein
|
||||
PROTID= Sequence ID of protein
|
||||
@ -52,30 +57,276 @@ Here is an overview of the flat file format that this parser deals with:
|
||||
SEQUENCE Sequence
|
||||
ACC= GenBank/EMBL/DDBJ accession number of sequence
|
||||
NID= Unique nucleotide sequence identifier (gi)
|
||||
PID= Unique protein sequence identifier (used for non-ESTs)
|
||||
PID= Unique protein sequence identifier (used for
|
||||
non-ESTs)
|
||||
CLONE= Clone identifier (used for ESTs only)
|
||||
END= End (5'/3') of clone insert read (used for ESTs only)
|
||||
LID= Library ID; see Hs.lib.info for library name and tissue
|
||||
MGC= 5' CDS-completeness indicator; if present,
|
||||
the clone associated with this sequence
|
||||
is believed CDS-complete. A value greater than 511
|
||||
is the gi of the CDS-complete mRNA matched by the EST,
|
||||
otherwise the value is an indicator of the reliability
|
||||
of the test indicating CDS completeness;
|
||||
higher values indicate more reliable CDS-completeness predictions.
|
||||
SEQTYPE= Description of the nucleotide sequence. Possible values are
|
||||
mRNA, EST and HTC.
|
||||
TRACE= The Trace ID of the EST sequence, as provided by NCBI Trace Archive
|
||||
PERIPHERAL= Indicator that the sequence is a suboptimal
|
||||
representative of the gene represented by this cluster.
|
||||
Peripheral sequences are those that are in a cluster
|
||||
which represents a spliced gene without sharing a
|
||||
splice junction with any other sequence. In many
|
||||
cases, they are unspliced transcripts originating
|
||||
from the gene.
|
||||
|
||||
// End of record
|
||||
END= End (5'/3') of clone insert read (used for
|
||||
ESTs only)
|
||||
LID= Library ID; see Hs.lib.info for library name
|
||||
and tissue
|
||||
MGC= 5' CDS-completeness indicator; if present, the
|
||||
clone associated with this sequence is believed
|
||||
CDS-complete. A value greater than 511 is the gi
|
||||
of the CDS-complete mRNA matched by the EST,
|
||||
otherwise the value is an indicator of the
|
||||
reliability of the test indicating CDS
|
||||
completeness; higher values indicate more
|
||||
reliable CDS-completeness predictions.
|
||||
SEQTYPE= Description of the nucleotide sequence.
|
||||
Possible values are mRNA, EST and HTC.
|
||||
TRACE= The Trace ID of the EST sequence, as provided by
|
||||
NCBI Trace Archive
|
||||
"""
|
||||
|
||||
|
||||
class SequenceLine:
    """Store the information for one SEQUENCE line from a Unigene file

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ACC=         GenBank/EMBL/DDBJ accession number of sequence
    NID=         Unique nucleotide sequence identifier (gi)
    PID=         Unique protein sequence identifier (used for non-ESTs)
    CLONE=       Clone identifier (used for ESTs only)
    END=         End (5'/3') of clone insert read (used for ESTs only)
    LID=         Library ID; see Hs.lib.info for library name and tissue
    MGC=         5' CDS-completeness indicator; if present,
                 the clone associated with this sequence
                 is believed CDS-complete. A value greater than 511
                 is the gi of the CDS-complete mRNA matched by the EST,
                 otherwise the value is an indicator of the reliability
                 of the test indicating CDS completeness;
                 higher values indicate more reliable CDS-completeness
                 predictions.
    SEQTYPE=     Description of the nucleotide sequence. Possible values
                 are mRNA, EST and HTC.
    TRACE=       The Trace ID of the EST sequence, as provided by NCBI
                 Trace Archive

    Attributes maintained by the class itself
    text         The unparsed text the object was created from ('' if no
                 text was given); this is what repr() returns.
    is_image     True if the CLONE value starts with 'IMAGE'.
    image        The CLONE value with its first six characters removed
                 (set only when is_image is True).
    """

    def __init__(self, text=None):
        """Create an empty line, or parse text if it is given.

        Every known field starts out as an empty string. Raises
        ValueError if a 'KEY=value' part of text contains no '='.
        """
        self.acc = ''
        self.nid = ''
        self.lid = ''
        self.pid = ''
        self.clone = ''
        self.image = ''
        self.is_image = False
        self.end = ''
        self.mgc = ''
        self.seqtype = ''
        self.trace = ''
        # Always define text so that repr() also works on an empty object.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Split text on '; ' and store each KEY=value pair as self.key."""
        for part in text.split('; '):
            # Split on the first '=' only, so a value may itself contain '='.
            key, val = part.split("=", 1)
            if key == 'CLONE' and val[:5] == 'IMAGE':
                self.is_image = True
                # Drop the 'IMAGE' prefix and the one character after it.
                self.image = val[6:]
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the unparsed text of the line ('' for an empty object)."""
        return self.text
|
||||
|
||||
|
||||
class ProtsimLine:
    """Store the information for one PROTSIM line from a Unigene file

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ORG=         Organism
    PROTGI=      Sequence GI of protein
    PROTID=      Sequence ID of protein
    PCT=         Percent alignment
    ALN=         length of aligned region (aa)

    The unparsed text is kept in the 'text' attribute ('' if no text was
    given); this is what repr() returns. All values are stored as strings.
    """

    def __init__(self, text=None):
        """Create an empty line, or parse text if it is given.

        Every known field starts out as an empty string. Raises
        ValueError if a 'KEY=value' part of text contains no '='.
        """
        self.org = ''
        self.protgi = ''
        self.protid = ''
        self.pct = ''
        self.aln = ''
        # Always define text so that repr() also works on an empty object.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Split text on '; ' and store each KEY=value pair as self.key."""
        for part in text.split('; '):
            # Split on the first '=' only, so a value may itself contain '='.
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the unparsed text of the line ('' for an empty object)."""
        return self.text
|
||||
|
||||
|
||||
class STSLine:
    """Store the information for one STS line from a Unigene file

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE)

    ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
    UNISTS=      identifier in NCBI's UNISTS database

    The unparsed text is kept in the 'text' attribute ('' if no text was
    given); this is what repr() returns. All values are stored as strings.
    """

    def __init__(self, text=None):
        """Create an empty line, or parse text if it is given.

        Every known field starts out as an empty string. Raises
        ValueError if a 'KEY=value' part of text contains no '='.
        """
        self.acc = ''
        self.unists = ''
        # Always define text so that repr() also works on an empty object.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Split text on single spaces and store each KEY=value pair."""
        # Unlike SEQUENCE and PROTSIM lines, STS fields are separated by a
        # single space rather than by '; '.
        for part in text.split(' '):
            # Split on the first '=' only, so a value may itself contain '='.
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the unparsed text of the line ('' for an empty object)."""
        return self.text
|
||||
|
||||
|
||||
class Record:
    """Store a Unigene record

    A new record holds the following attributes, in this order:

    ID            ''   ID line
    species       ''   Hs, Bt, etc. (the part of the ID before the '.')
    title         ''   TITLE line
    symbol        ''   GENE line
    cytoband      ''   CYTOBAND line
    express       []   EXPRESS line, split on '|' into a list of strings
    restr_expr    ''   RESTR_EXPR line ('' by default; the parser
                       replaces it with a list of strings split on '|')
    gnm_terminus  ''   GNM_TERMINUS line
    gene_id       ''   GENE_ID line
    locuslink     ''   LOCUSLINK line
    homol         ''   HOMOL line ('' until the parser sees one)
    chromosome    ''   CHROMOSOME line
    protsim       []   PROTSIM entries, a list of ProtsimLine objects
    sequence      []   SEQUENCE entries, a list of SequenceLine objects
    sts           []   STS entries, a list of STSLine objects
    txmap         []   TXMAP entries
    """

    def __init__(self):
        """Create an empty record with every attribute at its default."""
        # (attribute name, factory for its empty value), in storage order.
        # str() gives '' and list() gives a fresh [] for each record.
        layout = (
            ("ID", str),            # ID line
            ("species", str),       # Hs, Bt, etc.
            ("title", str),         # TITLE line
            ("symbol", str),        # GENE line
            ("cytoband", str),      # CYTOBAND line
            ("express", list),      # EXPRESS line
            ("restr_expr", str),    # RESTR_EXPR line
            ("gnm_terminus", str),  # GNM_TERMINUS line
            ("gene_id", str),       # GENE_ID line
            ("locuslink", str),     # LOCUSLINK line
            ("homol", str),         # HOMOL line
            ("chromosome", str),    # CHROMOSOME line
            ("protsim", list),      # PROTSIM entries
            ("sequence", list),     # SEQUENCE entries
            ("sts", list),          # STS entries
            ("txmap", list),        # TXMAP entries
        )
        for attribute, make_empty in layout:
            setattr(self, attribute, make_empty())

    def __repr__(self):
        """Return '<ClassName> ID symbol' followed by the title on line 2."""
        shown = (self.__class__.__name__, self.ID, self.symbol, self.title)
        return "<%s> %s %s\n%s" % shown
|
||||
|
||||
|
||||
def parse(handle):
    """Iterate over the UniGene records in handle.

    This is a generator: each step reads one more record with _read()
    and yields it. Iteration stops as soon as _read() returns a false
    value, which it does when handle has no more lines. Any ValueError
    raised by _read() for malformed input propagates to the caller.
    """
    current = _read(handle)
    while current:
        yield current
        current = _read(handle)
|
||||
|
||||
|
||||
def read(handle):
    """Read exactly one UniGene record from handle and return it.

    Raises ValueError if handle contains no record, or if anything at
    all (including blank lines) follows the first record. Any ValueError
    raised by _read() for malformed input also propagates.
    """
    record = _read(handle)
    if not record:
        raise ValueError("No UniGene record found")
    # We should have reached the end of the record by now.  _read()
    # consumes the handle by iterating over it, so look for leftover
    # input the same way: on Python 2 file objects, calling read() after
    # iteration can miss data held in the iterator's read-ahead buffer.
    for line in handle:
        raise ValueError("More than one UniGene record found")
    return record
|
||||
|
||||
|
||||
# Everything below is private
|
||||
|
||||
|
||||
def _read(handle):
    """Read the next UniGene record from handle.

    handle is iterated line by line. The first UG_INDENT (12) characters
    of each line hold the tag and the rest of the line holds the value;
    trailing whitespace is stripped from both.

    Returns the Record that ends at the next '//' line, or None if
    handle yields no lines at all.

    Raises ValueError if
     - a line other than an ID line appears before the ID line,
     - a HOMOL line is neither YES nor NO,
     - the tag is not one of the tags handled below,
     - the record has no SCOUNT line, or its SCOUNT value is not an
       integer or differs from the number of SEQUENCE lines,
     - the input ends before the closing '//' line.
    """
    # NOTE(review): Record declares gnm_terminus and txmap, but no branch
    # below handles GNM_TERMINUS or TXMAP lines; such lines are rejected
    # as unknown tags.
    UG_INDENT = 12
    record = None
    scount = None
    for line in handle:
        tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip()
        line = line.rstrip()
        if tag == "ID":
            record = Record()
            record.ID = value
            record.species = record.ID.split('.')[0]
        elif record is None:
            # Every other branch needs a record to store into.
            raise ValueError("Expected an ID line, found: %s" % line)
        elif tag == "TITLE":
            record.title = value
        elif tag == "GENE":
            record.symbol = value
        elif tag == "GENE_ID":
            record.gene_id = value
        elif tag == "LOCUSLINK":
            record.locuslink = value
        elif tag == "HOMOL":
            if value == "YES":
                record.homol = True
            elif value == "NO":
                record.homol = False
            else:
                raise ValueError("Cannot parse HOMOL line %s" % line)
        elif tag == "EXPRESS":
            record.express = [word.strip() for word in value.split("|")]
        elif tag == "RESTR_EXPR":
            record.restr_expr = [word.strip() for word in value.split("|")]
        elif tag == "CHROMOSOME":
            record.chromosome = value
        elif tag == "CYTOBAND":
            record.cytoband = value
        elif tag == "PROTSIM":
            record.protsim.append(ProtsimLine(value))
        elif tag == "SCOUNT":
            scount = int(value)
        elif tag == "SEQUENCE":
            record.sequence.append(SequenceLine(value))
        elif tag == "STS":
            record.sts.append(STSLine(value))
        elif tag == '//':
            if scount is None:
                raise ValueError("No SCOUNT line found in record %s"
                                 % record.ID)
            if len(record.sequence) != scount:
                raise ValueError("The number of sequences specified in "
                                 "the record (%d) does not agree with the "
                                 "number of sequences found (%d)"
                                 % (scount, len(record.sequence)))
            return record
        else:
            raise ValueError("Unknown tag %s" % tag)
    if record:
        # The input ended in the middle of a record (no '//' line).
        raise ValueError("Unexpected end of stream.")
    return None
|
||||
|
||||
|
||||
# Everything below is considered obsolete
|
||||
|
||||
|
||||
from Bio.ParserSupport import *
|
||||
import re
|
||||
|
||||
|
149
Tests/UniGene/Eca.1.2425.data
Normal file
149
Tests/UniGene/Eca.1.2425.data
Normal file
@ -0,0 +1,149 @@
|
||||
ID Eca.1
|
||||
TITLE Ribosomal protein L3
|
||||
GENE RPL3
|
||||
GENE_ID 100070291
|
||||
LOCUSLINK 100070291
|
||||
HOMOL YES
|
||||
EXPRESS blood| cartilage| trophoblast|adult
|
||||
RESTR_EXPR blood|adult
|
||||
PROTSIM ORG=10090; PROTGI=149270989; PROTID=XP_001477314.1; PCT=99.26; ALN=401
|
||||
PROTSIM ORG=5207; PROTGI=134117057; PROTID=XP_772755.1; PCT=70.91; ALN=384
|
||||
PROTSIM ORG=44689; PROTGI=66802278; PROTID=XP_629921.1; PCT=67.76; ALN=396
|
||||
PROTSIM ORG=9606; PROTGI=4506649; PROTID=NP_000958.1; PCT=99.50; ALN=401
|
||||
PROTSIM ORG=7719; PROTGI=198435984; PROTID=XP_002132034.1; PCT=80.15; ALN=402
|
||||
PROTSIM ORG=13616; PROTGI=126339540; PROTID=XP_001366844.1; PCT=97.05; ALN=405
|
||||
PROTSIM ORG=9615; PROTGI=73969069; PROTID=XP_531732.2; PCT=99.50; ALN=401
|
||||
PROTSIM ORG=3218; PROTGI=168040500; PROTID=XP_001772732.1; PCT=73.01; ALN=388
|
||||
PROTSIM ORG=9913; PROTGI=27807287; PROTID=NP_777140.1; PCT=99.26; ALN=401
|
||||
PROTSIM ORG=8355; PROTGI=148228673; PROTID=NP_001080341.1; PCT=94.04; ALN=401
|
||||
PROTSIM ORG=9258; PROTGI=149585070; PROTID=XP_001514408.1; PCT=97.29; ALN=367
|
||||
PROTSIM ORG=7955; PROTGI=48597014; PROTID=NP_001001590.1; PCT=92.31; ALN=401
|
||||
PROTSIM ORG=7227; PROTGI=17737907; PROTID=NP_524316.1; PCT=78.91; ALN=402
|
||||
PROTSIM ORG=28985; PROTGI=50311593; PROTID=XP_455822.1; PCT=72.28; ALN=385
|
||||
PROTSIM ORG=9796; PROTGI=149743311; PROTID=XP_001501957.1; PCT=100.00; ALN=401
|
||||
PROTSIM ORG=4896; PROTGI=19115692; PROTID=NP_594780.1; PCT=74.61; ALN=385
|
||||
PROTSIM ORG=4932; PROTGI=6324637; PROTID=NP_014706.1; PCT=72.02; ALN=385
|
||||
PROTSIM ORG=8364; PROTGI=187607061; PROTID=NP_001120075.1; PCT=94.54; ALN=401
|
||||
PROTSIM ORG=5911; PROTGI=118389862; PROTID=XP_001027976.1; PCT=64.60; ALN=386
|
||||
PROTSIM ORG=4530; PROTGI=115487526; PROTID=NP_001066250.1; PCT=73.20; ALN=387
|
||||
PROTSIM ORG=10116; PROTGI=38454246; PROTID=NP_942048.1; PCT=99.01; ALN=401
|
||||
PROTSIM ORG=5833; PROTGI=124802670; PROTID=XP_001347556.1; PCT=69.45; ALN=382
|
||||
PROTSIM ORG=148305; PROTGI=145608344; PROTID=XP_360650.2; PCT=71.28; ALN=389
|
||||
PROTSIM ORG=7668; PROTGI=115725235; PROTID=XP_791350.2; PCT=81.50; ALN=399
|
||||
PROTSIM ORG=3702; PROTGI=15218306; PROTID=NP_175009.1; PCT=71.47; ALN=387
|
||||
PROTSIM ORG=45351; PROTGI=156359547; PROTID=XP_001624829.1; PCT=80.10; ALN=396
|
||||
PROTSIM ORG=7091; PROTGI=112982798; PROTID=NP_001037126.1; PCT=80.65; ALN=402
|
||||
PROTSIM ORG=33169; PROTGI=45188079; PROTID=NP_984302.1; PCT=72.35; ALN=386
|
||||
PROTSIM ORG=7460; PROTGI=66566113; PROTID=XP_624821.1; PCT=80.89; ALN=402
|
||||
PROTSIM ORG=5141; PROTGI=164424622; PROTID=XP_963317.2; PCT=71.54; ALN=389
|
||||
PROTSIM ORG=6239; PROTGI=71984538; PROTID=NP_001021254.1; PCT=75.63; ALN=397
|
||||
PROTSIM ORG=3055; PROTGI=159489312; PROTID=XP_001702641.1; PCT=72.99; ALN=384
|
||||
PROTSIM ORG=7070; PROTGI=189240524; PROTID=XP_971875.2; PCT=76.67; ALN=402
|
||||
PROTSIM ORG=7165; PROTGI=158291770; PROTID=XP_313303.4; PCT=76.56; ALN=400
|
||||
PROTSIM ORG=9544; PROTGI=109094341; PROTID=XP_001095608.1; PCT=99.50; ALN=401
|
||||
PROTSIM ORG=7176; PROTGI=170041842; PROTID=XP_001848658.1; PCT=74.75; ALN=403
|
||||
PROTSIM ORG=5888; PROTGI=145475213; PROTID=XP_001423629.1; PCT=66.67; ALN=377
|
||||
PROTSIM ORG=9031; PROTGI=57525400; PROTID=NP_001006241.1; PCT=97.27; ALN=401
|
||||
PROTSIM ORG=7029; PROTGI=193580256; PROTID=XP_001951042.1; PCT=79.40; ALN=402
|
||||
PROTSIM ORG=9598; PROTGI=114608746; PROTID=XP_518669.2; PCT=93.73; ALN=381
|
||||
SCOUNT 69
|
||||
SEQUENCE ACC=XM_001501907.1; NID=g149743310; PID=g149743311; SEQTYPE=Model
|
||||
SEQUENCE ACC=CD469355.1; NID=g31390623; CLONE=LeukoS2_3_H08_A024; END=5'; LID=13776; SEQTYPE=EST; TRACE=891191391
|
||||
SEQUENCE ACC=CD535273.1; NID=g31577688; CLONE=LeukoN5_1_E01_A027; END=5'; LID=13843; SEQTYPE=EST; TRACE=891188893
|
||||
SEQUENCE ACC=CD466249.1; NID=g31387517; CLONE=LeukoN2_3_E11_A024; END=3'; LID=13773; SEQTYPE=EST; TRACE=891186094
|
||||
SEQUENCE ACC=CD465822.1; NID=g31387090; CLONE=LeukoN1_8_C11_A023; END=3'; LID=13772; SEQTYPE=EST; TRACE=891184882
|
||||
SEQUENCE ACC=CD466309.1; NID=g31387577; CLONE=LeukoN2_3_E11_A024; END=5'; LID=13773; SEQTYPE=EST; TRACE=891185997
|
||||
SEQUENCE ACC=CD465893.1; NID=g31387161; CLONE=LeukoN1_8_C11_A023; END=5'; LID=13772; SEQTYPE=EST; TRACE=891185010
|
||||
SEQUENCE ACC=CD471087.1; NID=g31392355; CLONE=LeukoS5_4_H03_A027; END=5'; LID=13778; SEQTYPE=EST; TRACE=891193766
|
||||
SEQUENCE ACC=CD472103.1; NID=g31393371; CLONE=LeukoS6_1_C10_A028; END=3'; LID=13779; SEQTYPE=EST; TRACE=891194618
|
||||
SEQUENCE ACC=CD471067.1; NID=g31392335; CLONE=LeukoS5_4_H03_A027; END=3'; LID=13778; SEQTYPE=EST; TRACE=891194175
|
||||
SEQUENCE ACC=CD470984.1; NID=g31392252; CLONE=LeukoS5_3_A04_A027; END=5'; LID=13778; SEQTYPE=EST; TRACE=891194114
|
||||
SEQUENCE ACC=CD472154.1; NID=g31393422; CLONE=LeukoS6_1_H10_A028; END=3'; LID=13779; SEQTYPE=EST; TRACE=891195062
|
||||
SEQUENCE ACC=CD467586.1; NID=g31388854; CLONE=LeukoS1_5_G11_A023; END=5'; LID=13774; SEQTYPE=EST; TRACE=891190912
|
||||
SEQUENCE ACC=CD465578.1; NID=g31386846; CLONE=LeukoN1_5_F07_A023; END=5'; LID=13772; SEQTYPE=EST; TRACE=891185096
|
||||
SEQUENCE ACC=CD465858.1; NID=g31387126; CLONE=LeukoN1_8_H04_A023; END=3'; LID=13772; SEQTYPE=EST; TRACE=891185129
|
||||
SEQUENCE ACC=CD471796.1; NID=g31393064; CLONE=LeukoS6_2_D05_A028; END=3'; LID=13779; SEQTYPE=EST; TRACE=891194917
|
||||
SEQUENCE ACC=CD471724.1; NID=g31392992; CLONE=LeukoS6_4_F09_A028; END=5'; LID=13779; SEQTYPE=EST; TRACE=891195058
|
||||
SEQUENCE ACC=CD471610.1; NID=g31392878; CLONE=LeukoS6_4_F09_A028; END=3'; LID=13779; SEQTYPE=EST; TRACE=891194846
|
||||
SEQUENCE ACC=CD535233.1; NID=g31577648; CLONE=LeukoN5_1_B08_A027; END=5'; LID=13843; SEQTYPE=EST; TRACE=891188752
|
||||
SEQUENCE ACC=CD464482.1; NID=g31385750; CLONE=LeukoN4_4_D01_A026; END=3'; LID=13771; SEQTYPE=EST; TRACE=891187846
|
||||
SEQUENCE ACC=CD468770.1; NID=g31390038; CLONE=LeukoS3_5_D03_A025; END=5'; LID=13775; SEQTYPE=EST; TRACE=891192280
|
||||
SEQUENCE ACC=CD470966.1; NID=g31392234; CLONE=LeukoS5_3_F05_A027; END=5'; LID=13778; SEQTYPE=EST; TRACE=891193981
|
||||
SEQUENCE ACC=CD536398.1; NID=g31578813; CLONE=LeukoN6_7_C09_A028; END=5'; LID=13844; SEQTYPE=EST; TRACE=891184341
|
||||
SEQUENCE ACC=CD535162.1; NID=g31577577; CLONE=LeukoN5_1_E01_A027; END=3'; LID=13843; SEQTYPE=EST; TRACE=891188481
|
||||
SEQUENCE ACC=CD536377.1; NID=g31578792; CLONE=LeukoN6_7_C09_A028; END=3'; LID=13844; SEQTYPE=EST; TRACE=891184713
|
||||
SEQUENCE ACC=CX603705.1; NID=g57720427; CLONE=CT02036A2C09; LID=16895; SEQTYPE=EST
|
||||
SEQUENCE ACC=CD470556.1; NID=g31391824; CLONE=LeukoS4_5_C09_A026; END=5'; LID=13777; SEQTYPE=EST; TRACE=891193234
|
||||
SEQUENCE ACC=CD466384.1; NID=g31387652; CLONE=LeukoN2_3_A06_A024; END=5'; LID=13773; SEQTYPE=EST; TRACE=891186500
|
||||
SEQUENCE ACC=CD472213.1; NID=g31393481; CLONE=LeukoS6_1_C10_A028; END=5'; LID=13779; SEQTYPE=EST; TRACE=891194841
|
||||
SEQUENCE ACC=DN510913.1; NID=g60721103; CLONE=HL02021A2C09; LID=17147; SEQTYPE=EST
|
||||
SEQUENCE ACC=CD472329.1; NID=g31393597; CLONE=LeukoS6_5_H05_A028; END=3'; LID=13779; SEQTYPE=EST; TRACE=891195397
|
||||
SEQUENCE ACC=CD536804.1; NID=g31579219; CLONE=LeukoN6_6_F12_A028; END=3'; LID=13844; SEQTYPE=EST; TRACE=891184838
|
||||
SEQUENCE ACC=CD466071.1; NID=g31387339; CLONE=LeukoN2_1_E06_A024; END=5'; LID=13773; SEQTYPE=EST; TRACE=891186571
|
||||
SEQUENCE ACC=CD471909.1; NID=g31393177; CLONE=LeukoS6_2_D05_A028; END=5'; LID=13779; SEQTYPE=EST; TRACE=891195132
|
||||
SEQUENCE ACC=CD465985.1; NID=g31387253; CLONE=LeukoN2_1_E06_A024; END=3'; LID=13773; SEQTYPE=EST; TRACE=891186530
|
||||
SEQUENCE ACC=CD471244.1; NID=g31392512; CLONE=LeukoS5_1_F09_A027; END=5'; LID=13778; SEQTYPE=EST; TRACE=891193761
|
||||
SEQUENCE ACC=DN508615.1; NID=g60718805; CLONE=HL02013A1F04; LID=17147; SEQTYPE=EST
|
||||
SEQUENCE ACC=CD465146.1; NID=g31386414; CLONE=LeukoN1_2_E04_A023; END=3'; LID=13772; SEQTYPE=EST; TRACE=891185503
|
||||
SEQUENCE ACC=CD465620.1; NID=g31386888; CLONE=LeukoN1_6_E09_A023; END=3'; LID=13772; SEQTYPE=EST; TRACE=891184965
|
||||
SEQUENCE ACC=CD472042.1; NID=g31393310; CLONE=LeukoS6_3_F02_A028; END=5'; LID=13779; SEQTYPE=EST; TRACE=891194801
|
||||
SEQUENCE ACC=CD471148.1; NID=g31392416; CLONE=LeukoS5_4_F11_A027; END=5'; LID=13778; SEQTYPE=EST; TRACE=891194177
|
||||
SEQUENCE ACC=CD466241.1; NID=g31387509; CLONE=LeukoN2_3_A06_A024; END=3'; LID=13773; SEQTYPE=EST; TRACE=891186007
|
||||
SEQUENCE ACC=CD472182.1; NID=g31393450; CLONE=LeukoS6_1_H10_A028; END=5'; LID=13779; SEQTYPE=EST; TRACE=891194593
|
||||
SEQUENCE ACC=CD470462.1; NID=g31391730; CLONE=LeukoS4_5_C12_A026; END=3'; LID=13777; SEQTYPE=EST; TRACE=891193322
|
||||
SEQUENCE ACC=CX596662.1; NID=g57706330; CLONE=CT020015A10A06; LID=16895; SEQTYPE=EST
|
||||
SEQUENCE ACC=CD536825.1; NID=g31579240; CLONE=LeukoN6_6_F12_A028; END=5'; LID=13844; SEQTYPE=EST; TRACE=891184460
|
||||
SEQUENCE ACC=CD471373.1; NID=g31392641; CLONE=LeukoS5_5_H05_A027; END=3'; LID=13778; SEQTYPE=EST; TRACE=891194564
|
||||
SEQUENCE ACC=CD470875.1; NID=g31392143; CLONE=LeukoS5_3_F05_A027; END=3'; LID=13778; SEQTYPE=EST; TRACE=891193793
|
||||
SEQUENCE ACC=CD470392.1; NID=g31391660; CLONE=LeukoS4_4_G05_A026; END=3'; LID=13777; SEQTYPE=EST; TRACE=891193723
|
||||
SEQUENCE ACC=DN509023.1; NID=g60719213; CLONE=HL02014B1D10; LID=17147; SEQTYPE=EST
|
||||
SEQUENCE ACC=CD471226.1; NID=g31392494; CLONE=LeukoS5_1_F09_A027; END=3'; LID=13778; SEQTYPE=EST; TRACE=891194172
|
||||
SEQUENCE ACC=CD470547.1; NID=g31391815; CLONE=LeukoS4_5_C12_A026; END=5'; LID=13777; SEQTYPE=EST; TRACE=891193220
|
||||
SEQUENCE ACC=CX593399.1; NID=g57699803; CLONE=CT020005A10B06; LID=16895; SEQTYPE=EST
|
||||
SEQUENCE ACC=CD465261.1; NID=g31386529; CLONE=LeukoN1_2_E04_A023; END=5'; LID=13772; SEQTYPE=EST; TRACE=891185735
|
||||
SEQUENCE ACC=CD470468.1; NID=g31391736; CLONE=LeukoS4_5_C09_A026; END=3'; LID=13777; SEQTYPE=EST; TRACE=891193330
|
||||
SEQUENCE ACC=CX594874.1; NID=g57702753; CLONE=CT020009B10E10; LID=16895; SEQTYPE=EST
|
||||
SEQUENCE ACC=CD464520.1; NID=g31385788; CLONE=LeukoN4_4_D01_A026; END=5'; LID=13771; SEQTYPE=EST; TRACE=891188021
|
||||
SEQUENCE ACC=AW260814.1; NID=g6636524; END=5'; LID=2777; SEQTYPE=EST
|
||||
SEQUENCE ACC=CD465691.1; NID=g31386959; CLONE=LeukoN1_6_E09_A023; END=5'; LID=13772; SEQTYPE=EST; TRACE=891185089
|
||||
SEQUENCE ACC=CX595434.1; NID=g57703875; CLONE=CT020011A20F11; LID=16895; SEQTYPE=EST
|
||||
SEQUENCE ACC=CD464759.1; NID=g31386027; CLONE=LeukoN4_6_A09_A026; END=5'; LID=13771; SEQTYPE=EST; TRACE=891188174
|
||||
SEQUENCE ACC=CD469441.1; NID=g31390709; CLONE=LeukoS2_3_F03_A024; END=5'; LID=13776; SEQTYPE=EST; TRACE=946412740
|
||||
SEQUENCE ACC=CD471416.1; NID=g31392684; CLONE=LeukoS5_5_H05_A027; END=5'; LID=13778; SEQTYPE=EST; TRACE=891194460
|
||||
SEQUENCE ACC=CD536439.1; NID=g31578854; CLONE=LeukoN6_7_B04_A028; END=5'; LID=13844; SEQTYPE=EST; TRACE=891184662
|
||||
SEQUENCE ACC=CX605282.1; NID=g57723597; CLONE=CT02041A1A07; LID=16895; SEQTYPE=EST
|
||||
SEQUENCE ACC=CD464302.1; NID=g31385570; CLONE=LeukoN4_3_B01_A026; END=3'; LID=13771; SEQTYPE=EST; TRACE=891187636
|
||||
SEQUENCE ACC=CD469295.1; NID=g31390563; CLONE=LeukoS2_3_F03_A024; END=3'; LID=13776; SEQTYPE=EST; TRACE=891191446
|
||||
SEQUENCE ACC=CD470883.1; NID=g31392151; CLONE=LeukoS5_3_G08_A027; END=3'; LID=13778; SEQTYPE=EST; TRACE=891193865
|
||||
SEQUENCE ACC=CD468749.1; NID=g31390017; CLONE=LeukoS3_5_E02_A025; END=5'; LID=13775; SEQTYPE=EST; TRACE=891192160
|
||||
//
|
||||
ID Eca.2425
|
||||
TITLE Immunoglobulin-like transcript 11 protein
|
||||
GENE ILT11B
|
||||
CYTOBAND 10p12
|
||||
GENE_ID 100034238
|
||||
LOCUSLINK 100034238
|
||||
HOMOL YES
|
||||
EXPRESS blood|adult
|
||||
PROTSIM ORG=10090; PROTGI=156766061; PROTID=NP_001074708.2; PCT=50.00; ALN=281
|
||||
PROTSIM ORG=9606; PROTGI=32490553; PROTID=NP_067073.1; PCT=62.42; ALN=296
|
||||
PROTSIM ORG=13616; PROTGI=126346255; PROTID=XP_001375479.1; PCT=40.43; ALN=281
|
||||
PROTSIM ORG=9615; PROTGI=73947540; PROTID=XP_854148.1; PCT=42.11; ALN=303
|
||||
PROTSIM ORG=9913; PROTGI=194674905; PROTID=XP_001788837.1; PCT=61.11; ALN=233
|
||||
PROTSIM ORG=9258; PROTGI=149631692; PROTID=XP_001516158.1; PCT=35.65; ALN=229
|
||||
PROTSIM ORG=9796; PROTGI=146262003; PROTID=NP_001075996.1; PCT=100.00; ALN=279
|
||||
PROTSIM ORG=10116; PROTGI=109461031; PROTID=XP_001065532.1; PCT=52.77; ALN=269
|
||||
PROTSIM ORG=9544; PROTGI=100818611; PROTID=NP_001035767.1; PCT=57.35; ALN=277
|
||||
PROTSIM ORG=9823; PROTGI=178057314; PROTID=NP_001116615.1; PCT=43.01; ALN=278
|
||||
PROTSIM ORG=9598; PROTGI=58801536; PROTID=NP_001009045.1; PCT=54.51; ALN=276
|
||||
SCOUNT 9
|
||||
SEQUENCE ACC=AB120409.1; NID=g45597282; PID=g45597283; SEQTYPE=mRNA
|
||||
SEQUENCE ACC=NM_001082527.1; NID=g146262002; PID=g146262003; SEQTYPE=mRNA
|
||||
SEQUENCE ACC=AB120410.1; NID=g45597284; PID=g45597285; SEQTYPE=mRNA
|
||||
SEQUENCE ACC=AB120411.1; NID=g45597286; PID=g45597287; SEQTYPE=mRNA
|
||||
SEQUENCE ACC=CD467202.1; NID=g31388470; CLONE=LeukoS1_2_D11_A023; END=5'; LID=13774; SEQTYPE=EST; TRACE=891190473
|
||||
SEQUENCE ACC=CD535974.1; NID=g31578389; CLONE=LeukoN6_2_F07_A028; END=3'; LID=13844; SEQTYPE=EST; TRACE=891184067
|
||||
SEQUENCE ACC=CD469544.1; NID=g31390812; CLONE=LeukoS2_4_D05_A024; END=5'; LID=13776; SEQTYPE=EST; TRACE=891191700
|
||||
SEQUENCE ACC=CD528742.1; NID=g31567364; CLONE=LeukoN3_7_F08_A025; END=3'; LID=13842; SEQTYPE=EST; TRACE=891187302
|
||||
SEQUENCE ACC=CD469487.1; NID=g31390755; CLONE=LeukoS2_4_D05_A024; END=3'; LID=13776; SEQTYPE=EST; TRACE=891191801
|
||||
//
|
70
Tests/UniGene/Hs.2.data
Normal file
70
Tests/UniGene/Hs.2.data
Normal file
@ -0,0 +1,70 @@
|
||||
ID Hs.2
|
||||
TITLE N-acetyltransferase 2 (arylamine N-acetyltransferase)
|
||||
GENE NAT2
|
||||
CYTOBAND 8p22
|
||||
GENE_ID 10
|
||||
LOCUSLINK 10
|
||||
HOMOL YES
|
||||
EXPRESS bone| connective tissue| intestine| liver| liver tumor| normal| soft tissue/muscle tissue tumor| adult
|
||||
RESTR_EXPR adult
|
||||
CHROMOSOME 8
|
||||
STS ACC=PMC310725P3 UNISTS=272646
|
||||
STS ACC=WIAF-2120 UNISTS=44576
|
||||
STS ACC=G59899 UNISTS=137181
|
||||
STS ACC=G06461 UNISTS=17088
|
||||
STS ACC=GDB:310612 UNISTS=156422
|
||||
STS ACC=GDB:310613 UNISTS=156423
|
||||
STS ACC=GDB:187676 UNISTS=155563
|
||||
PROTSIM ORG=10090; PROTGI=6754794; PROTID=NP_035004.1; PCT=76.55; ALN=288
|
||||
PROTSIM ORG=9796; PROTGI=149742490; PROTID=XP_001487907.1; PCT=79.66; ALN=288
|
||||
PROTSIM ORG=9986; PROTGI=126722851; PROTID=NP_001075655.1; PCT=76.90; ALN=288
|
||||
PROTSIM ORG=9544; PROTGI=113461974; PROTID=NP_001038201.1; PCT=93.10; ALN=288
|
||||
PROTSIM ORG=9606; PROTGI=116295260; PROTID=NP_000006.2; PCT=100.00; ALN=288
|
||||
PROTSIM ORG=7719; PROTGI=198423589; PROTID=XP_002126320.1; PCT=29.46; ALN=240
|
||||
PROTSIM ORG=10116; PROTGI=16758720; PROTID=NP_446306.1; PCT=75.52; ALN=288
|
||||
PROTSIM ORG=13616; PROTGI=126303190; PROTID=XP_001371814.1; PCT=63.67; ALN=287
|
||||
PROTSIM ORG=9031; PROTGI=45384408; PROTID=NP_990671.1; PCT=57.93; ALN=288
|
||||
PROTSIM ORG=9913; PROTGI=115497526; PROTID=NP_001069040.1; PCT=80.34; ALN=288
|
||||
PROTSIM ORG=9258; PROTGI=149640224; PROTID=XP_001508302.1; PCT=62.37; ALN=278
|
||||
PROTSIM ORG=7955; PROTGI=125821897; PROTID=XP_001334322.1; PCT=43.21; ALN=279
|
||||
PROTSIM ORG=9598; PROTGI=114619004; PROTID=XP_519631.2; PCT=98.28; ALN=288
|
||||
SCOUNT 38
|
||||
SEQUENCE ACC=BC067218.1; NID=g45501306; PID=g45501307; SEQTYPE=mRNA
|
||||
SEQUENCE ACC=NM_000015.2; NID=g116295259; PID=g116295260; SEQTYPE=mRNA
|
||||
SEQUENCE ACC=D90042.1; NID=g219415; PID=g219416; SEQTYPE=mRNA
|
||||
SEQUENCE ACC=D90040.1; NID=g219411; PID=g219412; SEQTYPE=mRNA
|
||||
SEQUENCE ACC=BC015878.1; NID=g16198419; PID=g16198420; SEQTYPE=mRNA
|
||||
SEQUENCE ACC=CR407631.1; NID=g47115198; PID=g47115199; SEQTYPE=mRNA
|
||||
SEQUENCE ACC=BG569293.1; NID=g13576946; CLONE=IMAGE:4722596; END=5'; LID=6989; SEQTYPE=EST; TRACE=44157214
|
||||
SEQUENCE ACC=AI792606.1; NID=g5340322; CLONE=IMAGE:1870937; END=5'; LID=1079; SEQTYPE=EST
|
||||
SEQUENCE ACC=BG568794.1; NID=g13576447; CLONE=IMAGE:4716636; END=5'; LID=6989; SEQTYPE=EST; TRACE=44156817
|
||||
SEQUENCE ACC=BG533459.1; NID=g13524999; CLONE=IMAGE:4072143; END=5'; LID=6989; SEQTYPE=EST; TRACE=44404609
|
||||
SEQUENCE ACC=BG568400.1; NID=g13576053; CLONE=IMAGE:4716802; END=5'; LID=6989; SEQTYPE=EST; TRACE=44156561
|
||||
SEQUENCE ACC=BG618195.1; NID=g13669566; CLONE=IMAGE:4767316; END=5'; LID=6989; SEQTYPE=EST; TRACE=45338366
|
||||
SEQUENCE ACC=BG563731.1; NID=g13571383; CLONE=IMAGE:4712210; END=5'; LID=6989; SEQTYPE=EST; TRACE=44153506
|
||||
SEQUENCE ACC=AI733799.1; NID=g5054912; CLONE=IMAGE:1870937; END=3'; LID=1079; SEQTYPE=EST
|
||||
SEQUENCE ACC=BG569272.1; NID=g13576925; CLONE=IMAGE:4722638; END=5'; LID=6989; SEQTYPE=EST; TRACE=44157191
|
||||
SEQUENCE ACC=AJ581147.1; NID=g73759744; PID=g73759745; SEQTYPE=mRNA
|
||||
SEQUENCE ACC=BU624903.1; NID=g23291118; CLONE=UI-H-FG1-bgl-g-02-0-UI; END=3'; LID=11914; SEQTYPE=EST; TRACE=159705553
|
||||
SEQUENCE ACC=BG617259.1; NID=g13668630; CLONE=IMAGE:4734378; END=5'; LID=6989; SEQTYPE=EST; TRACE=44229423
|
||||
SEQUENCE ACC=CV029049.1; NID=g51487181; END=5'; LID=16264; SEQTYPE=EST
|
||||
SEQUENCE ACC=BP264356.1; NID=g52179587; CLONE=HSI15615; END=5'; LID=16400; SEQTYPE=EST
|
||||
SEQUENCE ACC=BP262043.1; NID=g52177274; CLONE=HSI05750; END=5'; LID=16400; SEQTYPE=EST
|
||||
SEQUENCE ACC=BP262787.1; NID=g52178018; CLONE=HSI08034; END=5'; LID=16400; SEQTYPE=EST
|
||||
SEQUENCE ACC=CB161982.1; NID=g28148108; CLONE=L17N670205n1-15-F12; END=5'; LID=12542; SEQTYPE=EST
|
||||
SEQUENCE ACC=BX095770.1; NID=g27827877; CLONE=IMAGp998I184581_,_IMAGE:1870937; LID=1079; SEQTYPE=EST
|
||||
SEQUENCE ACC=AI262683.1; NID=g3870886; CLONE=IMAGE:1870937; END=3'; LID=1079; SEQTYPE=EST
|
||||
SEQUENCE ACC=AI460128.1; NID=g4313009; CLONE=IMAGE:2151449; END=3'; LID=1556; SEQTYPE=EST
|
||||
SEQUENCE ACC=CB161860.1; NID=g28147986; CLONE=L17N670205n1-41-A04; END=5'; LID=12542; SEQTYPE=EST
|
||||
SEQUENCE ACC=AJ581145.1; NID=g73759740; PID=g73759741; SEQTYPE=mRNA
|
||||
SEQUENCE ACC=EG327405.1; NID=g116004924; CLONE=MGC12part.1.3.L1.1.D01.; LID=17261; SEQTYPE=EST
|
||||
SEQUENCE ACC=AJ581144.1; NID=g73759738; PID=g73759739; SEQTYPE=mRNA
|
||||
SEQUENCE ACC=EG327340.1; NID=g116004859; CLONE=MGC12part.1.2.L1.1.D01.; LID=17261; SEQTYPE=EST
|
||||
SEQUENCE ACC=AV658623.1; NID=g9879637; CLONE=GLCFOD10; END=3'; LID=5601; SEQTYPE=EST
|
||||
SEQUENCE ACC=AV658656.1; NID=g9879670; CLONE=GLCFOG07; END=3'; LID=5601; SEQTYPE=EST
|
||||
SEQUENCE ACC=AV684197.2; NID=g55943471; CLONE=GKCFZH06; END=5'; LID=6533; SEQTYPE=EST
|
||||
SEQUENCE ACC=AJ581146.1; NID=g73759742; PID=g73759743; SEQTYPE=mRNA
|
||||
SEQUENCE ACC=EG327618.1; NID=g116005137; CLONE=MGC11part.2.3.L1.1.E06.; LID=17261; SEQTYPE=EST
|
||||
SEQUENCE ACC=BG204539.1; NID=g13726226; LID=8655; SEQTYPE=EST
|
||||
SEQUENCE ACC=AU099534.1; NID=g13550663; CLONE=HSI08034; END=5'; LID=8800; SEQTYPE=EST
|
||||
//
|
File diff suppressed because it is too large
Load Diff
34
Tests/test_UniGene_obsolete.py
Normal file
34
Tests/test_UniGene_obsolete.py
Normal file
@ -0,0 +1,34 @@
|
||||
from Bio import UniGene
|
||||
|
||||
#Start of the UniGene file for Monodelphis domestica downloaded from:
|
||||
#ftp://ftp.ncbi.nih.gov/repository/UniGene/Monodelphis_domestica
|
||||
handle = open("UniGene/Mdm_partial.data")
try:
    # Parse the file with the obsolete Iterator/RecordParser interface.
    ugparser = UniGene.Iterator(handle, UniGene.RecordParser())
    for record in ugparser:
        assert isinstance(record.ID, str)
        assert isinstance(record.title, str)
        assert isinstance(record.species, str)
        assert isinstance(record.express, list)
        assert isinstance(record.sequence, list)

        # Single-argument print calls give the same output as the
        # Python 2 print statements they replace.
        print(record.ID)
        print("Title: '%s'" % record.title)
        print("Expressed: %s" % (record.express,))
        print("Chromosome: %s" % (record.chromosome,))
        if record.sequence:
            print("Sequences:")
            for s in record.sequence:
                assert isinstance(s, UniGene.UnigeneSequenceRecord)
                print(s)
        else:
            print("No sequences")

        assert record.species == "Mdm"
        #Should be no PROTSIM lines in this file!
        assert isinstance(record.protsim, list)
        assert len(record.protsim) == 0

        # Blank line after each record.
        # NOTE(review): indentation was lost in this view; the blank print
        # is assumed to sit inside the loop - confirm against the expected
        # output file.
        print("")
    print("Done")
finally:
    # Close the data file even when one of the assertions above fails.
    handle.close()
|
Reference in New Issue
Block a user