mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 21:53:47 +08:00
UniProt GOA Added parser for GPI version 1.2 (#3290)
Added parser for UniProt GOA Gene Product Information (GPI) version 1.2, including sample and test.
This commit is contained in:
@ -1,6 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2013, 2016 by Iddo Friedberg idoerg@gmail.com
|
||||
# All rights reserved.
|
||||
# Copyright 2013, 2016 by Iddo Friedberg idoerg@gmail.com. All rights reserved.
|
||||
# Copyright 2020 by Sergio Valqui. All rights reserved.
|
||||
#
|
||||
# This file is part of the Biopython distribution and governed by your
|
||||
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
||||
# Please see the LICENSE file that should have been included as part of this
|
||||
@ -10,15 +11,18 @@
|
||||
Uniprot-GOA README + GAF format description:
|
||||
ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/README
|
||||
|
||||
GAF formats:
|
||||
Gene Association File, GAF formats:
|
||||
http://geneontology.org/docs/go-annotation-file-gaf-format-2.1/
|
||||
http://geneontology.org/docs/go-annotation-file-gaf-format-2.0/
|
||||
|
||||
gp_association (GPA format) README:
|
||||
Gene Product Association Data (GPA format) README:
|
||||
http://geneontology.org/docs/gene-product-association-data-gpad-format/
|
||||
|
||||
gp_information (GPI format) README:
|
||||
Gene Product Information (GPI format) README:
|
||||
http://geneontology.org/docs/gene-product-information-gpi-format/
|
||||
|
||||
Go Annotation files are located here:
|
||||
ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/
|
||||
"""
|
||||
|
||||
|
||||
@ -127,6 +131,20 @@ GPI11FIELDS = [
|
||||
"Gene_Product_Properties",
|
||||
]
|
||||
|
||||
# GPI version 1.2
|
||||
GPI12FIELDS = [
|
||||
"DB",
|
||||
"DB_Object_ID",
|
||||
"DB_Object_Symbol",
|
||||
"DB_Object_Name",
|
||||
"DB_Object_Synonym",
|
||||
"DB_Object_Type",
|
||||
"Taxon",
|
||||
"Parent_Object_ID",
|
||||
"DB_Xref",
|
||||
"Gene_Product_Properties",
|
||||
]
|
||||
|
||||
|
||||
def _gpi10iterator(handle):
|
||||
"""Read GPI 1.0 format files (PRIVATE).
|
||||
@ -146,10 +164,10 @@ def _gpi10iterator(handle):
|
||||
|
||||
|
||||
def _gpi11iterator(handle):
|
||||
"""Read GPI 1.0 format files (PRIVATE).
|
||||
"""Read GPI 1.1 format files (PRIVATE).
|
||||
|
||||
This iterator is used to read a gp_information.goa_uniprot
|
||||
file which is in the GPI 1.0 format.
|
||||
file which is in the GPI 1.1 format.
|
||||
"""
|
||||
for inline in handle:
|
||||
if inline[0] == "!":
|
||||
@ -164,6 +182,25 @@ def _gpi11iterator(handle):
|
||||
yield dict(zip(GPI11FIELDS, inrec))
|
||||
|
||||
|
||||
def _gpi12iterator(handle):
|
||||
"""Read GPI 1.2 format files (PRIVATE).
|
||||
|
||||
This iterator is used to read a gp_information.goa_uniprot
|
||||
file which is in the GPI 1.2 format.
|
||||
"""
|
||||
for inline in handle:
|
||||
if inline[0] == "!":
|
||||
continue
|
||||
inrec = inline.rstrip("\n").split("\t")
|
||||
if len(inrec) == 1:
|
||||
continue
|
||||
inrec[3] = inrec[3].split("|") # DB_Object_Name
|
||||
inrec[4] = inrec[4].split("|") # DB_Object_Synonym(s)
|
||||
inrec[8] = inrec[8].split("|") # DB_Xref(s)
|
||||
inrec[9] = inrec[9].split("|") # Properties
|
||||
yield dict(zip(GPI12FIELDS, inrec))
|
||||
|
||||
|
||||
def gpi_iterator(handle):
|
||||
"""Read GPI format files.
|
||||
|
||||
@ -173,7 +210,9 @@ def gpi_iterator(handle):
|
||||
this function is a placeholder a future wrapper.
|
||||
"""
|
||||
inline = handle.readline()
|
||||
if inline.strip() == "!gpi-version: 1.1":
|
||||
if inline.strip() == "!gpi-version: 1.2":
|
||||
return _gpi12iterator(handle)
|
||||
elif inline.strip() == "!gpi-version: 1.1":
|
||||
# sys.stderr.write("gpi 1.1\n")
|
||||
return _gpi11iterator(handle)
|
||||
elif inline.strip() == "!gpi-version: 1.0":
|
||||
|
6
NEWS.rst
6
NEWS.rst
@ -125,6 +125,11 @@ Parsing motifs in ``minimal``` MEME format will use ``nsites`` when making
|
||||
the count matrix from the frequency matrix, instead of multiply the frequency
|
||||
matrix by 1000000.
|
||||
|
||||
Bio.UniProt.GOA now parses Gene Product Information (GPI) files version 1.2,
|
||||
files can be downloaded from the EBI ftp site:
|
||||
ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/
|
||||
|
||||
|
||||
Many thanks to the Biopython developers and community for making this release
|
||||
possible, especially the following contributors:
|
||||
|
||||
@ -132,6 +137,7 @@ possible, especially the following contributors:
|
||||
- Gert Hulselmans
|
||||
- João Rodrigues
|
||||
- Markus Piotrowski
|
||||
- Sergio Valqui
|
||||
- Suyash Gupta
|
||||
- Vini Salazar (first contribution)
|
||||
- Leighton Pritchard
|
||||
|
34
Tests/UniProt/goa_human_sample.gpi
Normal file
34
Tests/UniProt/goa_human_sample.gpi
Normal file
@ -0,0 +1,34 @@
|
||||
!gpi-version: 1.2
|
||||
!
|
||||
!This file has been trimmed, to include only few records, for testing Biopython GOA.py module.
|
||||
! Original file can be found here ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/HUMAN/
|
||||
!
|
||||
!The set of protein accessions included in this file is based on UniProt reference proteomes, which provide one protein per gene.
|
||||
!They include the protein sequences annotated in Swiss-Prot or the longest TrEMBL transcript if there is no Swiss-Prot record.
|
||||
!Protein accessions are represented in this file even if there is no associated GO annotation.
|
||||
!
|
||||
!Columns:
|
||||
!
|
||||
! name required? cardinality GAF column # Example content
|
||||
! DB required 1 1 UniProtKB
|
||||
! DB_Object_ID required 1 2/17 Q4VCS5-1
|
||||
! DB_Object_Symbol required 1 3 AMOT
|
||||
! DB_Object_Name optional 0 or greater 10 Angiomotin
|
||||
! DB_Object_Synonym(s) optional 0 or greater 11 AMOT|KIAA1071
|
||||
! DB_Object_Type required 1 12 protein
|
||||
! Taxon required 1 13 taxon:9606
|
||||
! Parent_Object_ID optional 0 or 1 - UniProtKB:Q4VCS5
|
||||
! DB_Xref(s) optional 0 or greater - HGNC:17810
|
||||
! Properties optional 0 or greater - db_subset=Swiss-Prot
|
||||
!
|
||||
!Generated: 2019-12-17 09:36
|
||||
!
|
||||
UniProtKB A0A024R1R8 hCG_2014768 HCG2014768, isoform CRA_a hCG_2014768 protein taxon:9606 db_subset=TrEMBL
|
||||
UniProtKB P01889 HLA-B HLA class I histocompatibility antigen, B alpha chain HLA-B|HLAB protein taxon:9606 HGNC:4932 db_subset=Swiss-Prot
|
||||
UniProtKB P01893 HLA-H Putative HLA class I histocompatibility antigen, alpha chain H HLA-H|HLAH protein taxon:9606 HGNC:4965 db_subset=Swiss-Prot
|
||||
UniProtKB P01903 HLA-DRA HLA class II histocompatibility antigen, DR alpha chain HLA-DRA|HLA-DRA1 protein taxon:9606 HGNC:4947 db_subset=Swiss-Prot
|
||||
UniProtKB P01906 HLA-DQA2 HLA class II histocompatibility antigen, DQ alpha 2 chain HLA-DQA2|HLA-DXA protein taxon:9606 HGNC:4943 db_subset=Swiss-Prot
|
||||
UniProtKB P01909 HLA-DQA1 HLA class II histocompatibility antigen, DQ alpha 1 chain HLA-DQA1 protein taxon:9606 HGNC:4942 db_subset=Swiss-Prot
|
||||
UniProtKB P01911 HLA-DRB1 HLA class II histocompatibility antigen, DRB1 beta chain HLA-DRB1 protein taxon:9606 HGNC:4948 db_subset=Swiss-Prot
|
||||
UniProtKB P01920 HLA-DQB1 HLA class II histocompatibility antigen, DQ beta 1 chain HLA-DQB1|HLA-DQB protein taxon:9606 HGNC:4944 db_subset=Swiss-Prot
|
||||
UniProtKB P02008 HBZ Hemoglobin subunit zeta HBZ|HBZ2 protein taxon:9606 HGNC:4835 db_subset=Swiss-Prot
|
@ -1,4 +1,5 @@
|
||||
# Copyright 2019 by Sergio Valqui. All rights reserved.
|
||||
# Copyright 2019-2020 by Sergio Valqui. All rights reserved.
|
||||
#
|
||||
# This file is part of the Biopython distribution and governed by your
|
||||
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
||||
# Please see the LICENSE file that should have been included as part of this
|
||||
@ -82,7 +83,7 @@ class GoaTests(unittest.TestCase):
|
||||
self.assertEqual(recs[0]["Annotation_Properties"], "go_evidence=ND")
|
||||
|
||||
def test_gpi_iterator(self):
|
||||
"""Test GOA GPI file iterator."""
|
||||
"""Test GOA GPI file iterator, gpi-version: 1.1."""
|
||||
recs = []
|
||||
with open("UniProt/gp_information.goa_yeast.28.gpi") as handle:
|
||||
for rec in GOA.gpi_iterator(handle):
|
||||
@ -107,6 +108,28 @@ class GoaTests(unittest.TestCase):
|
||||
self.assertEqual(recs[0]["DB_Xref"], [""])
|
||||
self.assertEqual(recs[0]["Gene_Product_Properties"], ["db_subset=Swiss-Prot"])
|
||||
|
||||
def test_gpi_iterator_one_two(self):
|
||||
"""Test GOA GPI file iterator, gpi-version: 1.2."""
|
||||
recs = []
|
||||
with open("UniProt/goa_human_sample.gpi") as handle:
|
||||
for rec in GOA.gpi_iterator(handle):
|
||||
recs.append(rec)
|
||||
self.assertEqual(len(recs), 9)
|
||||
self.assertEqual(sorted(recs[0].keys()), sorted(GOA.GPI12FIELDS))
|
||||
# Check values of first record
|
||||
self.assertEqual(recs[0]["DB"], "UniProtKB")
|
||||
self.assertEqual(recs[0]["DB_Object_ID"], "A0A024R1R8")
|
||||
self.assertEqual(recs[0]["DB_Object_Symbol"], "hCG_2014768")
|
||||
self.assertEqual(
|
||||
recs[0]["DB_Object_Name"], ["HCG2014768, isoform CRA_a"],
|
||||
)
|
||||
self.assertEqual(recs[0]["DB_Object_Synonym"], ["hCG_2014768"])
|
||||
self.assertEqual(recs[0]["DB_Object_Type"], "protein")
|
||||
self.assertEqual(recs[0]["Taxon"], "taxon:9606")
|
||||
self.assertEqual(recs[0]["Parent_Object_ID"], "")
|
||||
self.assertEqual(recs[0]["DB_Xref"], [""])
|
||||
self.assertEqual(recs[0]["Gene_Product_Properties"], ["db_subset=TrEMBL"])
|
||||
|
||||
def test_selection_writing(self):
|
||||
"""Test record_has, and writerec.
|
||||
|
||||
|
Reference in New Issue
Block a user