UniProt XML: parse <ligand> tags (#5001)

* enable handling of <ligand> tags in UniProt XML files

* test parsing of <ligand> in UniProt XML

* add name to CONTRIB.rst

* format with black

* enable capture of multiple ligands per feature from UniProt XML files; update tests accordingly

* add mock UniProt XML file with multiple ligand tags per feature

* test that the combined list of all ligands has the expected length and that a feature can hold multiple ligands
This commit is contained in:
Ira Horecka
2025-05-30 16:09:26 -07:00
committed by GitHub
parent 91740a7788
commit 1172869397
5 changed files with 2402 additions and 7 deletions

View File

@ -16,19 +16,15 @@ originally introduced by SwissProt ("swiss" format in Bio.SeqIO).
""" """
import warnings
from xml.etree import ElementTree from xml.etree import ElementTree
from xml.parsers.expat import errors from xml.parsers.expat import errors
import warnings
from Bio import SeqFeature from Bio import BiopythonDeprecationWarning, SeqFeature
from Bio.Seq import Seq from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord from Bio.SeqRecord import SeqRecord
from Bio import BiopythonDeprecationWarning from .Interfaces import SequenceIterator, _BytesIOSource
from .Interfaces import _BytesIOSource
from .Interfaces import SequenceIterator
NS = "{http://uniprot.org/uniprot}" NS = "{http://uniprot.org/uniprot}"
REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)" REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)"
@ -472,6 +468,19 @@ class UniprotIterator(SequenceIterator):
feature.location = SeqFeature.SimpleLocation( feature.location = SeqFeature.SimpleLocation(
start_position, end_position start_position, end_position
) )
elif feature_element.tag == NS + "ligand":
# Support multiple ligand entries per feature
name = None
db_ref = None
for child in feature_element:
if child.tag == NS + "name":
name = child.text.strip() if child.text else None
elif child.tag == NS + "dbReference":
db_ref = child.attrib.get("id")
# Append to a list of ligands in qualifiers
lig_list = feature.qualifiers.setdefault("ligands", [])
lig_list.append({"name": name, "db_ref": db_ref})
continue
else: else:
try: try:
feature.qualifiers[feature_element.tag.replace(NS, "")] = ( feature.qualifiers[feature_element.tag.replace(NS, "")] = (

View File

@ -155,6 +155,7 @@ please open an issue on GitHub or mention it on the mailing list.
- Iddo Friedberg <https://github.com/idoerg> - Iddo Friedberg <https://github.com/idoerg>
- Igor S. Gerasimov <https://github.com/foxtran> - Igor S. Gerasimov <https://github.com/foxtran>
- Ilya Flyamer <https://github.com/Phlya> - Ilya Flyamer <https://github.com/Phlya>
- Ira Horecka <https://github.com/irahorecka>
- Isaac Ellmen <https://github.com/Ellmen> - Isaac Ellmen <https://github.com/Ellmen>
- Ivan Antonov <https://github.com/vanya-antonov> - Ivan Antonov <https://github.com/vanya-antonov>
- Jacek Śmietański <https://github.com/dadoskawina> - Jacek Śmietański <https://github.com/dadoskawina>

2343
Tests/SwissProt/P62330.xml Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,18 @@
<uniprot xmlns="http://uniprot.org/uniprot">
<entry dataset="Swiss-Prot" created="2024-01-01" modified="2025-01-01" version="1">
<name>TEST_PROTEIN</name>
<accession>DUMMY123</accession>
<sequence length="1" mass="100" version="1">A</sequence>
<feature type="binding site">
<location><position position="1"/></location>
<ligand>
<name>ATP</name>
<dbReference type="ChEBI" id="CHEBI:15422"/>
</ligand>
<ligand>
<name>ADP</name>
<dbReference type="ChEBI" id="CHEBI:16761"/>
</ligand>
</feature>
</entry>
</uniprot>

View File

@ -498,6 +498,30 @@ class ParserTests(SeqRecordTestBaseClass):
# test Entry version # test Entry version
self.assertEqual(seq_record.annotations["entry_version"], 158) self.assertEqual(seq_record.annotations["entry_version"], 158)
def test_P62330_ligand(self):
    """Test that <ligand> elements of P62330 land in feature qualifiers."""
    record = SeqIO.read("SwissProt/P62330.xml", "uniprot-xml")
    # Flatten the per-feature "ligands" lists, keeping feature order, so the
    # first ligand of the first annotated feature ends up at index 0.
    ligands = [
        ligand
        for feature in record.features
        for ligand in feature.qualifiers.get("ligands", [])
    ]
    self.assertEqual(len(ligands), 5)
    first = ligands[0]
    self.assertEqual(first["name"], "GTP")
    self.assertEqual(first["db_ref"], "CHEBI:37565")
def test_multiligand_binding_site(self):
    """Test that a single binding site can carry several ligands."""
    record = SeqIO.read("SwissProt/multiligand.xml", "uniprot-xml")
    # Keep only binding-site features that actually recorded ligands.
    binding_sites = []
    for feature in record.features:
        if feature.type != "binding site":
            continue
        if "ligands" not in feature.qualifiers:
            continue
        binding_sites.append(feature)
    self.assertTrue(binding_sites, "No binding site with ligands found")
    ligands = binding_sites[0].qualifiers["ligands"]
    self.assertEqual(len(ligands), 2)
    # Order of the ligands is not asserted, only that both names are present.
    names = set()
    for ligand in ligands:
        names.add(ligand["name"])
    self.assertIn("ATP", names)
    self.assertIn("ADP", names)
def compare_txt_xml(self, old, new): def compare_txt_xml(self, old, new):
"""Compare text and XML based parser output.""" """Compare text and XML based parser output."""
self.assertEqual(old.id, new.id) self.assertEqual(old.id, new.id)