UniProt GOA Added parser for GPI version 1.2 (#3290)

Added parser for UniProt GOA Gene Product Information (GPI) version 1.2, including sample and test.
This commit is contained in:
Sergio
2021-03-23 20:28:27 +11:00
committed by GitHub
parent 0e6abf6bf3
commit c945dedc88
4 changed files with 112 additions and 10 deletions

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python
# Copyright 2013, 2016 by Iddo Friedberg idoerg@gmail.com
# All rights reserved.
# Copyright 2013, 2016 by Iddo Friedberg idoerg@gmail.com. All rights reserved.
# Copyright 2020 by Sergio Valqui. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
@ -10,15 +11,18 @@
Uniprot-GOA README + GAF format description:
ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/README
GAF formats:
Gene Association File, GAF formats:
http://geneontology.org/docs/go-annotation-file-gaf-format-2.1/
http://geneontology.org/docs/go-annotation-file-gaf-format-2.0/
gp_association (GPA format) README:
Gene Product Association Data (GPA format) README:
http://geneontology.org/docs/gene-product-association-data-gpad-format/
gp_information (GPI format) README:
Gene Product Information (GPI format) README:
http://geneontology.org/docs/gene-product-information-gpi-format/
Go Annotation files are located here:
ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/
"""
@ -127,6 +131,20 @@ GPI11FIELDS = [
"Gene_Product_Properties",
]
# GPI version 1.2
GPI12FIELDS = [
"DB",
"DB_Object_ID",
"DB_Object_Symbol",
"DB_Object_Name",
"DB_Object_Synonym",
"DB_Object_Type",
"Taxon",
"Parent_Object_ID",
"DB_Xref",
"Gene_Product_Properties",
]
def _gpi10iterator(handle):
"""Read GPI 1.0 format files (PRIVATE).
@ -146,10 +164,10 @@ def _gpi10iterator(handle):
def _gpi11iterator(handle):
"""Read GPI 1.0 format files (PRIVATE).
"""Read GPI 1.1 format files (PRIVATE).
This iterator is used to read a gp_information.goa_uniprot
file which is in the GPI 1.0 format.
file which is in the GPI 1.1 format.
"""
for inline in handle:
if inline[0] == "!":
@ -164,6 +182,25 @@ def _gpi11iterator(handle):
yield dict(zip(GPI11FIELDS, inrec))
def _gpi12iterator(handle):
"""Read GPI 1.2 format files (PRIVATE).
This iterator is used to read a gp_information.goa_uniprot
file which is in the GPI 1.2 format.
"""
for inline in handle:
if inline[0] == "!":
continue
inrec = inline.rstrip("\n").split("\t")
if len(inrec) == 1:
continue
inrec[3] = inrec[3].split("|") # DB_Object_Name
inrec[4] = inrec[4].split("|") # DB_Object_Synonym(s)
inrec[8] = inrec[8].split("|") # DB_Xref(s)
inrec[9] = inrec[9].split("|") # Properties
yield dict(zip(GPI12FIELDS, inrec))
def gpi_iterator(handle):
"""Read GPI format files.
@ -173,7 +210,9 @@ def gpi_iterator(handle):
this function is a placeholder a future wrapper.
"""
inline = handle.readline()
if inline.strip() == "!gpi-version: 1.1":
if inline.strip() == "!gpi-version: 1.2":
return _gpi12iterator(handle)
elif inline.strip() == "!gpi-version: 1.1":
# sys.stderr.write("gpi 1.1\n")
return _gpi11iterator(handle)
elif inline.strip() == "!gpi-version: 1.0":

View File

@ -125,6 +125,11 @@ Parsing motifs in ``minimal``` MEME format will use ``nsites`` when making
the count matrix from the frequency matrix, instead of multiply the frequency
matrix by 1000000.
Bio.UniProt.GOA now parses Gene Product Information (GPI) files version 1.2,
files can be downloaded from the EBI ftp site:
ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/
Many thanks to the Biopython developers and community for making this release
possible, especially the following contributors:
@ -132,6 +137,7 @@ possible, especially the following contributors:
- Gert Hulselmans
- João Rodrigues
- Markus Piotrowski
- Sergio Valqui
- Suyash Gupta
- Vini Salazar (first contribution)
- Leighton Pritchard

View File

@ -0,0 +1,34 @@
!gpi-version: 1.2
!
!This file has been trimmed, to include only few records, for testing Biopython GOA.py module.
! Original file can be found here ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/HUMAN/
!
!The set of protein accessions included in this file is based on UniProt reference proteomes, which provide one protein per gene.
!They include the protein sequences annotated in Swiss-Prot or the longest TrEMBL transcript if there is no Swiss-Prot record.
!Protein accessions are represented in this file even if there is no associated GO annotation.
!
!Columns:
!
! name required? cardinality GAF column # Example content
! DB required 1 1 UniProtKB
! DB_Object_ID required 1 2/17 Q4VCS5-1
! DB_Object_Symbol required 1 3 AMOT
! DB_Object_Name optional 0 or greater 10 Angiomotin
! DB_Object_Synonym(s) optional 0 or greater 11 AMOT|KIAA1071
! DB_Object_Type required 1 12 protein
! Taxon required 1 13 taxon:9606
! Parent_Object_ID optional 0 or 1 - UniProtKB:Q4VCS5
! DB_Xref(s) optional 0 or greater - HGNC:17810
! Properties optional 0 or greater - db_subset=Swiss-Prot
!
!Generated: 2019-12-17 09:36
!
UniProtKB A0A024R1R8 hCG_2014768 HCG2014768, isoform CRA_a hCG_2014768 protein taxon:9606 db_subset=TrEMBL
UniProtKB P01889 HLA-B HLA class I histocompatibility antigen, B alpha chain HLA-B|HLAB protein taxon:9606 HGNC:4932 db_subset=Swiss-Prot
UniProtKB P01893 HLA-H Putative HLA class I histocompatibility antigen, alpha chain H HLA-H|HLAH protein taxon:9606 HGNC:4965 db_subset=Swiss-Prot
UniProtKB P01903 HLA-DRA HLA class II histocompatibility antigen, DR alpha chain HLA-DRA|HLA-DRA1 protein taxon:9606 HGNC:4947 db_subset=Swiss-Prot
UniProtKB P01906 HLA-DQA2 HLA class II histocompatibility antigen, DQ alpha 2 chain HLA-DQA2|HLA-DXA protein taxon:9606 HGNC:4943 db_subset=Swiss-Prot
UniProtKB P01909 HLA-DQA1 HLA class II histocompatibility antigen, DQ alpha 1 chain HLA-DQA1 protein taxon:9606 HGNC:4942 db_subset=Swiss-Prot
UniProtKB P01911 HLA-DRB1 HLA class II histocompatibility antigen, DRB1 beta chain HLA-DRB1 protein taxon:9606 HGNC:4948 db_subset=Swiss-Prot
UniProtKB P01920 HLA-DQB1 HLA class II histocompatibility antigen, DQ beta 1 chain HLA-DQB1|HLA-DQB protein taxon:9606 HGNC:4944 db_subset=Swiss-Prot
UniProtKB P02008 HBZ Hemoglobin subunit zeta HBZ|HBZ2 protein taxon:9606 HGNC:4835 db_subset=Swiss-Prot

View File

@ -1,4 +1,5 @@
# Copyright 2019 by Sergio Valqui. All rights reserved.
# Copyright 2019-2020 by Sergio Valqui. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
@ -82,7 +83,7 @@ class GoaTests(unittest.TestCase):
self.assertEqual(recs[0]["Annotation_Properties"], "go_evidence=ND")
def test_gpi_iterator(self):
"""Test GOA GPI file iterator."""
"""Test GOA GPI file iterator, gpi-version: 1.1."""
recs = []
with open("UniProt/gp_information.goa_yeast.28.gpi") as handle:
for rec in GOA.gpi_iterator(handle):
@ -107,6 +108,28 @@ class GoaTests(unittest.TestCase):
self.assertEqual(recs[0]["DB_Xref"], [""])
self.assertEqual(recs[0]["Gene_Product_Properties"], ["db_subset=Swiss-Prot"])
def test_gpi_iterator_one_two(self):
"""Test GOA GPI file iterator, gpi-version: 1.2."""
recs = []
with open("UniProt/goa_human_sample.gpi") as handle:
for rec in GOA.gpi_iterator(handle):
recs.append(rec)
self.assertEqual(len(recs), 9)
self.assertEqual(sorted(recs[0].keys()), sorted(GOA.GPI12FIELDS))
# Check values of first record
self.assertEqual(recs[0]["DB"], "UniProtKB")
self.assertEqual(recs[0]["DB_Object_ID"], "A0A024R1R8")
self.assertEqual(recs[0]["DB_Object_Symbol"], "hCG_2014768")
self.assertEqual(
recs[0]["DB_Object_Name"], ["HCG2014768, isoform CRA_a"],
)
self.assertEqual(recs[0]["DB_Object_Synonym"], ["hCG_2014768"])
self.assertEqual(recs[0]["DB_Object_Type"], "protein")
self.assertEqual(recs[0]["Taxon"], "taxon:9606")
self.assertEqual(recs[0]["Parent_Object_ID"], "")
self.assertEqual(recs[0]["DB_Xref"], [""])
self.assertEqual(recs[0]["Gene_Product_Properties"], ["db_subset=TrEMBL"])
def test_selection_writing(self):
"""Test record_has, and writerec.