Add BinaryCIF parser (#4707)

Also fixes a structure comparison bug
Author: Will Tyler
Committed by: GitHub
Date: 2024-06-11 03:35:46 +00:00
Parent: 5d3189e689
Commit: 4fe6640e46
12 changed files with 579 additions and 7 deletions

Bio/PDB/Atom.py

@@ -290,13 +290,11 @@ class Atom:
         return (
             self.name == other.name
-            and self.bfactor == other.bfactor
-            and self.occupancy == other.occupancy
+            and np.isclose(self.bfactor, other.bfactor)
+            and np.isclose(self.occupancy, other.occupancy)
             and self.altloc == other.altloc
             and self.fullname == other.fullname
-            and np.allclose(self.coord, other.coord)
-            if compare_coordinates
-            else True
+            and (np.allclose(self.coord, other.coord) if compare_coordinates else True)
             and getattr(self, "element", None) == getattr(other, "element", None)
             and getattr(self, "pqr_charge", None) == getattr(other, "pqr_charge", None)
             and getattr(self, "radius", None) == getattr(other, "radius", None)
         )

Bio/PDB/__init__.py

@@ -33,7 +33,6 @@ except ImportError:
 # Get a Structure object from a PDB file
 from .PDBParser import PDBParser
 from .MMCIFParser import MMCIFParser
 from .MMCIFParser import FastMMCIFParser
 from .PDBMLParser import PDBMLParser

Bio/PDB/bcifhelpermodule.c (new file, 207 lines)

@@ -0,0 +1,207 @@
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <math.h>
#include <stdint.h>
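/*
 * Helper extension for Bio.PDB.binary_cif: expands arrays compressed with the
 * BinaryCIF "IntegerPacking" encoding.  A run of sentinel values (the packed
 * type's MAX, or MIN/MAX for signed types) is summed together with the element
 * that ends the run, and the accumulated values are written to a 32-bit output
 * buffer supplied by the caller.
 */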
void
integer_unpack_u8(Py_buffer *in_view, Py_buffer *out_view)
{
Py_ssize_t in_size = in_view->shape[0];
Py_ssize_t in_index = 0;
Py_ssize_t out_index = 0;
uint8_t *in_data = in_view->buf;
uint32_t *out_data = out_view->buf;
while (in_index < in_size) {
uint32_t sum = in_data[in_index];
if (sum == UINT8_MAX) {
while (in_index + 1 < in_size) {
in_index += 1;
sum += in_data[in_index];
if (in_data[in_index] != UINT8_MAX) {
break;
}
}
}
out_data[out_index] = sum;
in_index += 1;
out_index += 1;
}
}
void
integer_unpack_u16(Py_buffer *in_view, Py_buffer *out_view)
{
Py_ssize_t in_size = in_view->shape[0];
Py_ssize_t in_index = 0;
Py_ssize_t out_index = 0;
uint16_t *in_data = in_view->buf;
uint32_t *out_data = out_view->buf;
while (in_index < in_size) {
uint32_t sum = in_data[in_index];
if (sum == UINT16_MAX) {
while (in_index + 1 < in_size) {
in_index += 1;
sum += in_data[in_index];
if (in_data[in_index] != UINT16_MAX) {
break;
}
}
}
out_data[out_index] = sum;
in_index += 1;
out_index += 1;
}
}
void
integer_unpack_i8(Py_buffer *in_view, Py_buffer *out_view)
{
Py_ssize_t in_size = in_view->shape[0];
Py_ssize_t in_index = 0;
Py_ssize_t out_index = 0;
int8_t *in_data = in_view->buf;
int32_t *out_data = out_view->buf;
while (in_index < in_size) {
int32_t sum = in_data[in_index];
if (sum == INT8_MAX || sum == INT8_MIN) {
while (in_index + 1 < in_size) {
in_index += 1;
sum += in_data[in_index];
if (in_data[in_index] != INT8_MAX && in_data[in_index] != INT8_MIN) {
break;
}
}
}
out_data[out_index] = sum;
in_index += 1;
out_index += 1;
}
}
void
integer_unpack_i16(Py_buffer *in_view, Py_buffer *out_view)
{
Py_ssize_t in_size = in_view->shape[0];
Py_ssize_t in_index = 0;
Py_ssize_t out_index = 0;
int16_t *in_data = in_view->buf;
int32_t *out_data = out_view->buf;
while (in_index < in_size) {
int32_t sum = in_data[in_index];
if (sum == INT16_MAX || sum == INT16_MIN) {
while (in_index + 1 < in_size) {
in_index += 1;
sum += in_data[in_index];
if (in_data[in_index] != INT16_MAX && in_data[in_index] != INT16_MIN) {
break;
}
}
}
out_data[out_index] = sum;
in_index += 1;
out_index += 1;
}
}
static PyObject *
integer_unpack(PyObject *self, PyObject *args)
{
PyObject *in = NULL;
PyObject *out = NULL;
if (!PyArg_ParseTuple(args, "OO", &in, &out)) {
return NULL;
}
Py_buffer in_view, out_view;
const int flags = PyBUF_ND | PyBUF_FORMAT;
if (PyObject_GetBuffer(in, &in_view, flags) != 0) {
return NULL;
}
if (PyObject_GetBuffer(out, &out_view, flags | PyBUF_WRITABLE) != 0) {
PyBuffer_Release(&in_view);
return NULL;
}
if (in_view.ndim != 1) {
PyErr_SetString(PyExc_ValueError, "First argument should be one-dimensional.");
goto exit;
}
if (out_view.ndim != 1) {
PyErr_SetString(PyExc_ValueError, "Second argument should be one-dimensional.");
goto exit;
}
const char format = in_view.format[0];
if (format == 'B') {
integer_unpack_u8(&in_view, &out_view);
}
else if (format == 'H') {
integer_unpack_u16(&in_view, &out_view);
}
else if (format == 'b') {
integer_unpack_i8(&in_view, &out_view);
}
else if (format == 'h') {
integer_unpack_i16(&in_view, &out_view);
}
else {
PyErr_Format(PyExc_ValueError,
"Unexpected buffer format: %s",
in_view.format);
}
exit:
PyBuffer_Release(&in_view);
PyBuffer_Release(&out_view);
if (PyErr_Occurred()) {
return NULL;
}
Py_RETURN_NONE;
}
static PyMethodDef IntegerUnpackMethods[] = {
{"integer_unpack", integer_unpack, METH_VARARGS, NULL},
{NULL, NULL, 0, NULL}
};
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"_bcif_helper",
NULL,
-1,
IntegerUnpackMethods
};
PyMODINIT_FUNC
PyInit__bcif_helper(void)
{
PyObject *m;
m = PyModule_Create(&moduledef);
if (!m) {
return NULL;
}
return m;
}
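The unpacking rule is easier to read in Python. Below is a rough pure-Python equivalent of the unsigned 8-bit case (the function name and sample values are illustrative only; the compiled module also handles uint16, int8, and int16 inputs, where the signed variants treat both INT_MIN and INT_MAX as run sentinels):

import numpy as np

def integer_unpack_u8_py(packed: np.ndarray, out: np.ndarray) -> None:
    # Pure-Python sketch of _bcif_helper.integer_unpack for uint8 input.
    sentinel = np.iinfo(np.uint8).max  # 255 marks a continued run
    in_index = 0
    out_index = 0
    while in_index < len(packed):
        total = int(packed[in_index])
        if total == sentinel:
            # Keep adding bytes until a value below the sentinel closes the run.
            while in_index + 1 < len(packed):
                in_index += 1
                total += int(packed[in_index])
                if packed[in_index] != sentinel:
                    break
        out[out_index] = total
        in_index += 1
        out_index += 1

packed = np.array([5, 255, 255, 3, 7], dtype=np.uint8)
unpacked = np.empty(3, dtype=np.uint32)
integer_unpack_u8_py(packed, unpacked)
print(unpacked)  # [  5 513   7]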

Bio/PDB/binary_cif.py (new file, 306 lines)

@@ -0,0 +1,306 @@
"""
A module to interact with BinaryCIF-formatted files.
"""
import gzip
from typing import Optional
import numpy as np
from collections import deque
try:
import msgpack
except ImportError:
from Bio import MissingPythonDependencyError
raise MissingPythonDependencyError(
"Install msgpack to use Bio.PDB.binaryCIF (e.g. pip install msgpack)"
) from None
import Bio.PDB._bcif_helper as _bcif_helper
from Bio.PDB.Structure import Structure
from Bio.PDB.StructureBuilder import StructureBuilder
# https://github.com/ihmwg/python-ihm/blob/main/ihm/format_bcif.py
# https://numpy.org/doc/stable/reference/arrays.dtypes.html#
# The "<" tells NumPy to use little endian representation.
# BinaryCIF always uses little endian.
_dtypes = {
1: np.dtype("<i1"), # Int8
2: np.dtype("<i2"), # Int16
3: np.dtype("<i4"), # Int32
4: np.dtype("<u1"), # UInt8
5: np.dtype("<u2"), # UInt16
6: np.dtype("<u4"), # UInt32
32: np.dtype("<f4"), # Float32
33: np.dtype("<f8"), # Float64
}
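# Each decoder below strips one (the outermost) encoding layer from a column in
# place: it replaces column["data"]["data"] with the decoded array and pops the
# encoding entry it handled, so _decode() can keep peeling layers until none
# remain.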
def _byte_array_decoder(column):
encoding = column["data"]["encoding"][-1]
assert encoding["kind"] == "ByteArray"
dtype = _dtypes[encoding["type"]]
column["data"]["data"] = np.frombuffer(column["data"]["data"], dtype)
column["data"]["encoding"].pop()
def _fixed_point_decoder(column):
encoding = column["data"]["encoding"][-1]
assert encoding["kind"] == "FixedPoint"
dtype = _dtypes[encoding["srcType"]]
factor = encoding["factor"]
data = column["data"]["data"]
assert data.dtype.type in (np.int32, np.uint32)
decoded_data = np.divide(data, factor, dtype=dtype)
column["data"]["data"] = decoded_data
column["data"]["encoding"].pop()
def _interval_quantization_decoder(column):
encoding = column["data"]["encoding"][-1]
assert encoding["kind"] == "IntervalQuantization"
min_val = encoding["min"]
max_val = encoding["max"]
num_steps = encoding["numSteps"]
delta = (max_val - min_val) / (num_steps - 1)
data = column["data"]["data"]
dtype = _dtypes[encoding["srcType"]]
decoded_data = np.add(min_val, np.multiply(data, delta, dtype=dtype), dtype=dtype)
column["data"]["data"] = decoded_data
column["data"]["encoding"].pop()
def _run_length_decoder(column):
encoding = column["data"]["encoding"][-1]
assert encoding["kind"] == "RunLength"
data = column["data"]["data"]
dtype = _dtypes[encoding["srcType"]]
decoded_data = np.repeat(data[::2].astype(dtype), data[1::2])
assert len(decoded_data) == encoding["srcSize"]
column["data"]["data"] = decoded_data
column["data"]["encoding"].pop()
def _delta_decoder(column):
encoding = column["data"]["encoding"][-1]
assert encoding["kind"] == "Delta"
dtype = _dtypes[encoding["srcType"]]
data = column["data"]["data"]
decoded_data = data.astype(dtype, copy=False)
decoded_data[0] += encoding["origin"]
decoded_data.cumsum(out=decoded_data)
column["data"]["data"] = decoded_data
column["data"]["encoding"].pop()
def _integer_packing_decoder(column):
encoding = column["data"]["encoding"][-1]
assert encoding["kind"] == "IntegerPacking"
byte_count = encoding["byteCount"]
src_size = encoding["srcSize"]
is_unsigned = encoding["isUnsigned"]
if is_unsigned:
dtype = np.dtype("<u4")
else:
dtype = np.dtype("<i4")
data = column["data"]["data"]
assert byte_count == data.dtype.itemsize
assert np.issubdtype(data.dtype, np.unsignedinteger) == is_unsigned
decoded_data = np.empty((src_size,), dtype)
_bcif_helper.integer_unpack(data, decoded_data)
column["data"]["data"] = decoded_data
column["data"]["encoding"].pop()
def _string_array_decoder(column):
encoding = column["data"]["encoding"][-1]
assert encoding["kind"] == "StringArray"
offsets_column = {
"data": {
"data": encoding["offsets"],
"encoding": encoding["offsetEncoding"],
}
}
lookup_column = {
"data": {
"data": column["data"]["data"],
"encoding": encoding["dataEncoding"],
}
}
string_data = encoding["stringData"]
offsets = _decode(offsets_column)
unique_strings = np.empty((len(offsets) - 1,), dtype=object)
for index in range(len(unique_strings)):
unique_string = string_data[offsets[index] : offsets[index + 1]]
unique_strings[index] = unique_string
lookups = _decode(lookup_column)
column["data"]["data"] = unique_strings[lookups]
column["data"]["encoding"].pop()
_decoders = {
"ByteArray": _byte_array_decoder,
"FixedPoint": _fixed_point_decoder,
"IntervalQuantization": _interval_quantization_decoder,
"RunLength": _run_length_decoder,
"Delta": _delta_decoder,
"IntegerPacking": _integer_packing_decoder,
"StringArray": _string_array_decoder,
}
def _decode(column):
# Note that decode modifies the column.
encodings = deque(column["data"]["encoding"])
column["data"]["encoding"] = encodings
while encodings:
encoding = encodings[-1]
_decoders[encoding["kind"]](column)
return column["data"]["data"]
class BinaryCIFParser:
"""A parser for BinaryCIF files.
See the `BinaryCIF specification <https://github.com/molstar/BinaryCIF>`_.
"""
def __init__(self):
"""Initialize a BinaryCIF parser."""
self._structure_builder = StructureBuilder()
def _get_hetero_field(self, atom_group: str, component_id: str) -> str:
if atom_group == "HETATM":
hetero_field = "W" if component_id in ("HOH", "WAT") else "H"
else:
hetero_field = " "
return hetero_field
def _get_residue_ids(self, columns):
atom_groups = _decode(columns["_atom_site.group_PDB"])
component_ids = _decode(columns["_atom_site.label_comp_id"])
hetero_fields = [
self._get_hetero_field(atom_group, component_id)
for atom_group, component_id in zip(atom_groups, component_ids)
]
insertion_codes = [
code or " " for code in _decode(columns["_atom_site.pdbx_PDB_ins_code"])
]
sequence_ids = _decode(columns["_atom_site.auth_seq_id"])
return list(zip(hetero_fields, sequence_ids, insertion_codes))
def _get_atoms(self, columns):
names = _decode(columns["_atom_site.label_atom_id"])
x_list = _decode(columns["_atom_site.Cartn_x"])
y_list = _decode(columns["_atom_site.Cartn_y"])
z_list = _decode(columns["_atom_site.Cartn_z"])
coordinates_list = np.stack((x_list, y_list, z_list), axis=1)
b_factors = _decode(columns["_atom_site.B_iso_or_equiv"])
occupancies = _decode(columns["_atom_site.occupancy"])
alt_ids = [
str(alt_id or " ") for alt_id in _decode(columns["_atom_site.label_alt_id"])
]
serial_numbers = _decode(columns["_atom_site.id"])
type_symbols = _decode(columns["_atom_site.type_symbol"])
return [
{
"name": names[index],
"coord": coordinates_list[index],
"b_factor": b_factors[index],
"occupancy": occupancies[index],
"altloc": alt_ids[index],
"fullname": names[index],
"serial_number": serial_numbers[index],
"element": type_symbols[index],
}
for index in range(len(serial_numbers))
]
def get_structure(self, id: Optional[str], source: str) -> Structure:
"""Parse and return the PDB structure from a BinaryCIF file.
:param str id: the PDB code for this structure
:param str source: the path to the BinaryCIF file
:return: the PDB structure
:rtype: Bio.PDB.Structure.Structure
"""
if hasattr(source, "seek"):
# This resets the source if source is a file handle.
source.seek(0)
with (
gzip.open(source, mode="rb")
if source.endswith(".gz")
else open(source, mode="rb")
) as file:
result = msgpack.unpack(file, use_list=True)
columns = {
f"{category['name']}.{column['name']}": column
for data_block in result["dataBlocks"]
for category in data_block["categories"]
for column in category["columns"]
}
atom_model_numbers = _decode(columns["_atom_site.pdbx_PDB_model_num"])
atom_chain_ids = _decode(columns["_atom_site.label_asym_id"])
atom_residue_ids = self._get_residue_ids(columns)
atom_component_ids = _decode(columns["_atom_site.label_comp_id"])
atoms = self._get_atoms(columns)
entry_id = _decode(columns["_entry.id"])[0]
self._structure_builder.init_structure(id or entry_id)
builder_model_count = 0
builder_model_number = None
builder_chain_id = None
builder_residue_id = None
builder_component_id = None
for index in range(len(atom_model_numbers)):
model_number = atom_model_numbers[index]
chain_id = atom_chain_ids[index]
residue_id = atom_residue_ids[index]
component_id = atom_component_ids[index]
if model_number != builder_model_number:
self._structure_builder.init_model(builder_model_count, model_number)
builder_model_count += 1
builder_model_number = model_number
builder_chain_id = None
builder_residue_id = None
if chain_id != builder_chain_id:
self._structure_builder.init_chain(chain_id)
builder_chain_id = chain_id
builder_residue_id = None
if residue_id != builder_residue_id or component_id != builder_component_id:
self._structure_builder.init_residue(component_id, *residue_id)
builder_residue_id = residue_id
builder_component_id = component_id
self._structure_builder.init_atom(**atoms[index])
return self._structure_builder.get_structure()
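To see how the decoder chain works end to end, a small hand-built column can be pushed through the module's private ``_decode`` helper: four Int32 values that were delta-encoded and then serialized as Int8 bytes are recovered by undoing the layers from last to first. The column dict below is made up for illustration; real columns come out of the msgpack-decoded file in exactly this shape. Importing ``Bio.PDB.binary_cif`` assumes msgpack is installed and the ``_bcif_helper`` extension is built.

from Bio.PDB.binary_cif import _decode  # private helper, used here only to illustrate

# Original values [1, 2, 4, 7]: delta-encoded with origin=1 to [0, 1, 2, 3],
# then written out as Int8 bytes.
column = {
    "data": {
        "data": bytes([0, 1, 2, 3]),
        "encoding": [
            {"kind": "Delta", "origin": 1, "srcType": 3},  # applied first when encoding
            {"kind": "ByteArray", "type": 1},              # applied last, decoded first
        ],
    }
}
print(_decode(column))  # [1 2 4 7]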


@@ -62,6 +62,28 @@ Example: get the list of the :math:`y` coordinates of all atoms
     >>> y_list = mmcif_dict["_atom_site.Cartn_y"]

+Reading a BinaryCIF file
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Create a ``BinaryCIFParser`` object:
+
+.. doctest ../Tests/PDB lib:numpy lib:msgpack
+
+.. code:: pycon
+
+    >>> from Bio.PDB.binary_cif import BinaryCIFParser
+    >>> parser = BinaryCIFParser()
+
+Call ``get_structure`` with the path to the BinaryCIF file:
+
+.. cont-doctest ../Tests/PDB lib:numpy lib:msgpack
+
+.. code:: pycon
+
+    >>> parser.get_structure("1GBT", "1gbt.bcif.gz")
+    <Structure id=1GBT>
+
 Reading files in the MMTF format
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
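The object returned by ``get_structure`` is a regular ``Bio.PDB.Structure.Structure``, so the usual model/chain/residue/atom traversal applies. A short continuation of the tutorial example, assuming it is run from the ``Tests/PDB`` directory where ``1gbt.bcif.gz`` lives:

from Bio.PDB.binary_cif import BinaryCIFParser

parser = BinaryCIFParser()
structure = parser.get_structure("1GBT", "1gbt.bcif.gz")

for model in structure:
    for chain in model:
        # Chain IDs here come from label_asym_id; len(chain) is the residue count.
        print(chain.id, len(chain))
print(sum(1 for _ in structure.get_atoms()))  # total number of atoms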

NEWS.rst

@@ -68,6 +68,12 @@ A parser has been added for parsing PDBML (PDB XML) files.
 `PDBML <https://pdbml.wwpdb.org/>`_ is a representation of PDB data in XML format.
 The PDB chapter of the tutorial is updated to show how to use the PDBML parser.

+Additionally, a parser has been added for BinaryCIF files.
+BinaryCIF is a compact, binary representation of CIF data.
+The PDB tutorial is updated to show how to use the BinaryCIF parser.
+The RCSB PDB recommends that users switch from MMTF to BinaryCIF.
+See the `announcement <https://www.rcsb.org/news/feature/65a1af31c76ca3abcc925d0c>`_.
+
 Bio.PDB Structure objects will now issue a warning - instead of an exception - when
 two children (e.g. residues) have identical IDs. This can be useful in some
 cases, e.g. renumbering residues in a chain.

Tests/PDB/1gbt.bcif.gz (new binary file, not shown)

Tests/PDB/3jqh.bcif.gz (new binary file, not shown)

Tests/PDB/6wg6.bcif.gz (new binary file, not shown)


@@ -159,6 +159,14 @@ class SortingTests(unittest.TestCase):
             structure2.strictly_equals(structure)
         )  # Strict equality should be symmetric

+        # Modify an atom
+        structure2[0]["A"][(" ", 200, " ")]["CA"].name = "AC"
+        self.assertFalse(structure.strictly_equals(structure2))
+        self.assertFalse(
+            structure2.strictly_equals(structure)
+        )  # Strict equality should be symmetric
+
         # Remove a chain from a model in the structure
         structure2[0].detach_child("A")


@@ -0,0 +1,25 @@
"""
Tests for BinaryCIF code in the PDB package.
"""
import unittest
from Bio.PDB import MMCIFParser
from Bio.PDB.binary_cif import BinaryCIFParser
class TestBinaryCIFParser(unittest.TestCase):
def test_get_structure(self):
mmcif_parser = MMCIFParser(auth_chains=False)
bcif_parser = BinaryCIFParser()
for entry in ["1GBT", "6WG6", "3JQH"]:
mmcif_structure = mmcif_parser.get_structure(entry, f"PDB/{entry}.cif")
bcif_structure = bcif_parser.get_structure(
entry, f"PDB/{entry.lower()}.bcif.gz"
)
self.assertTrue(
mmcif_structure.strictly_equals(
bcif_structure, compare_coordinates=True
)
)

setup.py

@@ -193,8 +193,9 @@ EXTENSIONS = [
     Extension(
         "Bio.Cluster._cluster", ["Bio/Cluster/cluster.c", "Bio/Cluster/clustermodule.c"]
     ),
-    Extension("Bio.PDB.kdtrees", ["Bio/PDB/kdtrees.c"]),
     Extension("Bio.PDB.ccealign", ["Bio/PDB/ccealignmodule.c"]),
+    Extension("Bio.PDB.kdtrees", ["Bio/PDB/kdtrees.c"]),
+    Extension("Bio.PDB._bcif_helper", ["Bio/PDB/bcifhelpermodule.c"]),
     Extension("Bio.SeqIO._twoBitIO", ["Bio/SeqIO/_twoBitIO.c"]),
 ]