Add BinaryCIF parser (#4707)

Also fixes a structure comparison bug
Author: Will Tyler
Committed by: GitHub
Date: 2024-06-11 03:35:46 +00:00
Parent: 5d3189e689
Commit: 4fe6640e46
12 changed files with 579 additions and 7 deletions

Bio/PDB/Atom.py

@@ -290,13 +290,11 @@ class Atom:
         return (
             self.name == other.name
-            and self.bfactor == other.bfactor
-            and self.occupancy == other.occupancy
+            and np.isclose(self.bfactor, other.bfactor)
+            and np.isclose(self.occupancy, other.occupancy)
             and self.altloc == other.altloc
             and self.fullname == other.fullname
-            and np.allclose(self.coord, other.coord)
-            if compare_coordinates
-            else True
+            and (np.allclose(self.coord, other.coord) if compare_coordinates else True)
             and getattr(self, "element", None) == getattr(other, "element", None)
             and getattr(self, "pqr_charge", None) == getattr(other, "pqr_charge", None)
             and getattr(self, "radius", None) == getattr(other, "radius", None)
         )

Bio/PDB/__init__.py

@@ -33,7 +33,6 @@ except ImportError:
 # Get a Structure object from a PDB file
 from .PDBParser import PDBParser
 from .MMCIFParser import MMCIFParser
 from .MMCIFParser import FastMMCIFParser
 from .PDBMLParser import PDBMLParser

Bio/PDB/bcifhelpermodule.c (new file, 207 lines)

@@ -0,0 +1,207 @@
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <math.h>
#include <stdint.h>
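/*
 * Helper extension for Bio.PDB.binary_cif: expands arrays compressed with the
 * BinaryCIF "IntegerPacking" encoding.  A run of sentinel values (the packed
 * type's MAX, or MIN/MAX for signed types) is summed together with the element
 * that ends the run, and the accumulated values are written to a 32-bit output
 * buffer supplied by the caller.
 */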
void
integer_unpack_u8(Py_buffer *in_view, Py_buffer *out_view)
{
Py_ssize_t in_size = in_view->shape[0];
Py_ssize_t in_index = 0;
Py_ssize_t out_index = 0;
uint8_t *in_data = in_view->buf;
uint32_t *out_data = out_view->buf;
while (in_index < in_size) {
uint32_t sum = in_data[in_index];
if (sum == UINT8_MAX) {
while (in_index + 1 < in_size) {
in_index += 1;
sum += in_data[in_index];
if (in_data[in_index] != UINT8_MAX) {
break;
}
}
}
out_data[out_index] = sum;
in_index += 1;
out_index += 1;
}
}
void
integer_unpack_u16(Py_buffer *in_view, Py_buffer *out_view)
{
Py_ssize_t in_size = in_view->shape[0];
Py_ssize_t in_index = 0;
Py_ssize_t out_index = 0;
uint16_t *in_data = in_view->buf;
uint32_t *out_data = out_view->buf;
while (in_index < in_size) {
uint32_t sum = in_data[in_index];
if (sum == UINT16_MAX) {
while (in_index + 1 < in_size) {
in_index += 1;
sum += in_data[in_index];
if (in_data[in_index] != UINT16_MAX) {
break;
}
}
}
out_data[out_index] = sum;
in_index += 1;
out_index += 1;
}
}
void
integer_unpack_i8(Py_buffer *in_view, Py_buffer *out_view)
{
Py_ssize_t in_size = in_view->shape[0];
Py_ssize_t in_index = 0;
Py_ssize_t out_index = 0;
int8_t *in_data = in_view->buf;
int32_t *out_data = out_view->buf;
while (in_index < in_size) {
int32_t sum = in_data[in_index];
if (sum == INT8_MAX || sum == INT8_MIN) {
while (in_index + 1 < in_size) {
in_index += 1;
sum += in_data[in_index];
if (in_data[in_index] != INT8_MAX && in_data[in_index] != INT8_MIN) {
break;
}
}
}
out_data[out_index] = sum;
in_index += 1;
out_index += 1;
}
}
void
integer_unpack_i16(Py_buffer *in_view, Py_buffer *out_view)
{
Py_ssize_t in_size = in_view->shape[0];
Py_ssize_t in_index = 0;
Py_ssize_t out_index = 0;
int16_t *in_data = in_view->buf;
int32_t *out_data = out_view->buf;
while (in_index < in_size) {
int32_t sum = in_data[in_index];
if (sum == INT16_MAX || sum == INT16_MIN) {
while (in_index + 1 < in_size) {
in_index += 1;
sum += in_data[in_index];
if (in_data[in_index] != INT16_MAX && in_data[in_index] != INT16_MIN) {
break;
}
}
}
out_data[out_index] = sum;
in_index += 1;
out_index += 1;
}
}
static PyObject *
integer_unpack(PyObject *self, PyObject *args)
{
PyObject *in = NULL;
PyObject *out = NULL;
if (!PyArg_ParseTuple(args, "OO", &in, &out)) {
return NULL;
}
Py_buffer in_view, out_view;
const int flags = PyBUF_ND | PyBUF_FORMAT;
if (PyObject_GetBuffer(in, &in_view, flags) != 0) {
return NULL;
}
if (PyObject_GetBuffer(out, &out_view, flags | PyBUF_WRITABLE) != 0) {
PyBuffer_Release(&in_view);
return NULL;
}
if (in_view.ndim != 1) {
PyErr_SetString(PyExc_ValueError, "First argument should be one-dimensional.");
goto exit;
}
if (out_view.ndim != 1) {
PyErr_SetString(PyExc_ValueError, "Second argument should be one-dimensional.");
goto exit;
}
const char format = in_view.format[0];
if (format == 'B') {
integer_unpack_u8(&in_view, &out_view);
}
else if (format == 'H') {
integer_unpack_u16(&in_view, &out_view);
}
else if (format == 'b') {
integer_unpack_i8(&in_view, &out_view);
}
else if (format == 'h') {
integer_unpack_i16(&in_view, &out_view);
}
else {
PyErr_Format(PyExc_ValueError,
"Unexpected buffer format: %s",
in_view.format);
}
exit:
PyBuffer_Release(&in_view);
PyBuffer_Release(&out_view);
if (PyErr_Occurred()) {
return NULL;
}
Py_RETURN_NONE;
}
static PyMethodDef IntegerUnpackMethods[] = {
{"integer_unpack", integer_unpack, METH_VARARGS, NULL},
{NULL, NULL, 0, NULL}
};
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"_bcif_helper",
NULL,
-1,
IntegerUnpackMethods
};
PyMODINIT_FUNC
PyInit__bcif_helper(void)
{
PyObject *m;
m = PyModule_Create(&moduledef);
if (!m) {
return NULL;
}
return m;
}
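The unpacking rule is easier to read in Python. Below is a rough pure-Python equivalent of the unsigned 8-bit case (the function name and sample values are illustrative only; the compiled module also handles uint16, int8, and int16 inputs, where the signed variants treat both INT_MIN and INT_MAX as run sentinels):

import numpy as np

def integer_unpack_u8_py(packed: np.ndarray, out: np.ndarray) -> None:
    # Pure-Python sketch of _bcif_helper.integer_unpack for uint8 input.
    sentinel = np.iinfo(np.uint8).max  # 255 marks a continued run
    in_index = 0
    out_index = 0
    while in_index < len(packed):
        total = int(packed[in_index])
        if total == sentinel:
            # Keep adding bytes until a value below the sentinel closes the run.
            while in_index + 1 < len(packed):
                in_index += 1
                total += int(packed[in_index])
                if packed[in_index] != sentinel:
                    break
        out[out_index] = total
        in_index += 1
        out_index += 1

packed = np.array([5, 255, 255, 3, 7], dtype=np.uint8)
unpacked = np.empty(3, dtype=np.uint32)
integer_unpack_u8_py(packed, unpacked)
print(unpacked)  # [  5 513   7]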

Bio/PDB/binary_cif.py (new file, 306 lines)

@@ -0,0 +1,306 @@
"""
A module to interact with BinaryCIF-formatted files.
"""
import gzip
from typing import Optional
import numpy as np
from collections import deque
try:
import msgpack
except ImportError:
from Bio import MissingPythonDependencyError
raise MissingPythonDependencyError(
"Install msgpack to use Bio.PDB.binaryCIF (e.g. pip install msgpack)"
) from None
import Bio.PDB._bcif_helper as _bcif_helper
from Bio.PDB.Structure import Structure
from Bio.PDB.StructureBuilder import StructureBuilder
# https://github.com/ihmwg/python-ihm/blob/main/ihm/format_bcif.py
# https://numpy.org/doc/stable/reference/arrays.dtypes.html#
# The "<" tells NumPy to use little endian representation.
# BinaryCIF always uses little endian.
_dtypes = {
1: np.dtype("<i1"), # Int8
2: np.dtype("<i2"), # Int16
3: np.dtype("<i4"), # Int32
4: np.dtype("<u1"), # UInt8
5: np.dtype("<u2"), # UInt16
6: np.dtype("<u4"), # UInt32
32: np.dtype("<f4"), # Float32
33: np.dtype("<f8"), # Float64
}
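# Each decoder below strips one (the outermost) encoding layer from a column in
# place: it replaces column["data"]["data"] with the decoded array and pops the
# encoding entry it handled, so _decode() can keep peeling layers until none
# remain.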
def _byte_array_decoder(column):
encoding = column["data"]["encoding"][-1]
assert encoding["kind"] == "ByteArray"
dtype = _dtypes[encoding["type"]]
column["data"]["data"] = np.frombuffer(column["data"]["data"], dtype)
column["data"]["encoding"].pop()
def _fixed_point_decoder(column):
encoding = column["data"]["encoding"][-1]
assert encoding["kind"] == "FixedPoint"
dtype = _dtypes[encoding["srcType"]]
factor = encoding["factor"]
data = column["data"]["data"]
assert data.dtype.type in (np.int32, np.uint32)
decoded_data = np.divide(data, factor, dtype=dtype)
column["data"]["data"] = decoded_data
column["data"]["encoding"].pop()
def _interval_quantization_decoder(column):
encoding = column["data"]["encoding"][-1]
assert encoding["kind"] == "IntervalQuantization"
min_val = encoding["min"]
max_val = encoding["max"]
num_steps = encoding["numSteps"]
delta = (max_val - min_val) / (num_steps - 1)
data = column["data"]["data"]
dtype = _dtypes[encoding["srcType"]]
decoded_data = np.add(min_val, np.multiply(data, delta, dtype=dtype), dtype=dtype)
column["data"]["data"] = decoded_data
column["data"]["encoding"].pop()
def _run_length_decoder(column):
encoding = column["data"]["encoding"][-1]
assert encoding["kind"] == "RunLength"
data = column["data"]["data"]
dtype = _dtypes[encoding["srcType"]]
decoded_data = np.repeat(data[::2].astype(dtype), data[1::2])
assert len(decoded_data) == encoding["srcSize"]
column["data"]["data"] = decoded_data
column["data"]["encoding"].pop()
def _delta_decoder(column):
encoding = column["data"]["encoding"][-1]
assert encoding["kind"] == "Delta"
dtype = _dtypes[encoding["srcType"]]
data = column["data"]["data"]
decoded_data = data.astype(dtype, copy=False)
decoded_data[0] += encoding["origin"]
decoded_data.cumsum(out=decoded_data)
column["data"]["data"] = decoded_data
column["data"]["encoding"].pop()
def _integer_packing_decoder(column):
encoding = column["data"]["encoding"][-1]
assert encoding["kind"] == "IntegerPacking"
byte_count = encoding["byteCount"]
src_size = encoding["srcSize"]
is_unsigned = encoding["isUnsigned"]
if is_unsigned:
dtype = np.dtype("<u4")
else:
dtype = np.dtype("<i4")
data = column["data"]["data"]
assert byte_count == data.dtype.itemsize
assert np.issubdtype(data.dtype, np.unsignedinteger) == is_unsigned
decoded_data = np.empty((src_size,), dtype)
_bcif_helper.integer_unpack(data, decoded_data)
column["data"]["data"] = decoded_data
column["data"]["encoding"].pop()
def _string_array_decoder(column):
encoding = column["data"]["encoding"][-1]
assert encoding["kind"] == "StringArray"
offsets_column = {
"data": {
"data": encoding["offsets"],
"encoding": encoding["offsetEncoding"],
}
}
lookup_column = {
"data": {
"data": column["data"]["data"],
"encoding": encoding["dataEncoding"],
}
}
string_data = encoding["stringData"]
offsets = _decode(offsets_column)
unique_strings = np.empty((len(offsets) - 1,), dtype=object)
for index in range(len(unique_strings)):
unique_string = string_data[offsets[index] : offsets[index + 1]]
unique_strings[index] = unique_string
lookups = _decode(lookup_column)
column["data"]["data"] = unique_strings[lookups]
column["data"]["encoding"].pop()
_decoders = {
"ByteArray": _byte_array_decoder,
"FixedPoint": _fixed_point_decoder,
"IntervalQuantization": _interval_quantization_decoder,
"RunLength": _run_length_decoder,
"Delta": _delta_decoder,
"IntegerPacking": _integer_packing_decoder,
"StringArray": _string_array_decoder,
}
def _decode(column):
# Note that decode modifies the column.
encodings = deque(column["data"]["encoding"])
column["data"]["encoding"] = encodings
while encodings:
encoding = encodings[-1]
_decoders[encoding["kind"]](column)
return column["data"]["data"]
class BinaryCIFParser:
"""A parser for BinaryCIF files.
See the `BinaryCIF specification <https://github.com/molstar/BinaryCIF>`_.
"""
def __init__(self):
"""Initialize a BinaryCIF parser."""
self._structure_builder = StructureBuilder()
def _get_hetero_field(self, atom_group: str, component_id: str) -> str:
if atom_group == "HETATM":
hetero_field = "W" if component_id in ("HOH", "WAT") else "H"
else:
hetero_field = " "
return hetero_field
def _get_residue_ids(self, columns):
atom_groups = _decode(columns["_atom_site.group_PDB"])
component_ids = _decode(columns["_atom_site.label_comp_id"])
hetero_fields = [
self._get_hetero_field(atom_group, component_id)
for atom_group, component_id in zip(atom_groups, component_ids)
]
insertion_codes = [
code or " " for code in _decode(columns["_atom_site.pdbx_PDB_ins_code"])
]
sequence_ids = _decode(columns["_atom_site.auth_seq_id"])
return list(zip(hetero_fields, sequence_ids, insertion_codes))
def _get_atoms(self, columns):
names = _decode(columns["_atom_site.label_atom_id"])
x_list = _decode(columns["_atom_site.Cartn_x"])
y_list = _decode(columns["_atom_site.Cartn_y"])
z_list = _decode(columns["_atom_site.Cartn_z"])
coordinates_list = np.stack((x_list, y_list, z_list), axis=1)
b_factors = _decode(columns["_atom_site.B_iso_or_equiv"])
occupancies = _decode(columns["_atom_site.occupancy"])
alt_ids = [
str(alt_id or " ") for alt_id in _decode(columns["_atom_site.label_alt_id"])
]
serial_numbers = _decode(columns["_atom_site.id"])
type_symbols = _decode(columns["_atom_site.type_symbol"])
return [
{
"name": names[index],
"coord": coordinates_list[index],
"b_factor": b_factors[index],
"occupancy": occupancies[index],
"altloc": alt_ids[index],
"fullname": names[index],
"serial_number": serial_numbers[index],
"element": type_symbols[index],
}
for index in range(len(serial_numbers))
]
def get_structure(self, id: Optional[str], source: str) -> Structure:
"""Parse and return the PDB structure from a BinaryCIF file.
:param str id: the PDB code for this structure
:param str source: the path to the BinaryCIF file
:return: the PDB structure
:rtype: Bio.PDB.Structure.Structure
"""
if hasattr(source, "seek"):
# This resets the source if source is a file handle.
source.seek(0)
with (
gzip.open(source, mode="rb")
if source.endswith(".gz")
else open(source, mode="rb")
) as file:
result = msgpack.unpack(file, use_list=True)
columns = {
f"{category['name']}.{column['name']}": column
for data_block in result["dataBlocks"]
for category in data_block["categories"]
for column in category["columns"]
}
atom_model_numbers = _decode(columns["_atom_site.pdbx_PDB_model_num"])
atom_chain_ids = _decode(columns["_atom_site.label_asym_id"])
atom_residue_ids = self._get_residue_ids(columns)
atom_component_ids = _decode(columns["_atom_site.label_comp_id"])
atoms = self._get_atoms(columns)
entry_id = _decode(columns["_entry.id"])[0]
self._structure_builder.init_structure(id or entry_id)
builder_model_count = 0
builder_model_number = None
builder_chain_id = None
builder_residue_id = None
builder_component_id = None
for index in range(len(atom_model_numbers)):
model_number = atom_model_numbers[index]
chain_id = atom_chain_ids[index]
residue_id = atom_residue_ids[index]
component_id = atom_component_ids[index]
if model_number != builder_model_number:
self._structure_builder.init_model(builder_model_count, model_number)
builder_model_count += 1
builder_model_number = model_number
builder_chain_id = None
builder_residue_id = None
if chain_id != builder_chain_id:
self._structure_builder.init_chain(chain_id)
builder_chain_id = chain_id
builder_residue_id = None
if residue_id != builder_residue_id or component_id != builder_component_id:
self._structure_builder.init_residue(component_id, *residue_id)
builder_residue_id = residue_id
builder_component_id = component_id
self._structure_builder.init_atom(**atoms[index])
return self._structure_builder.get_structure()
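To see how the decoder chain works end to end, a small hand-built column can be pushed through the module's private ``_decode`` helper: four Int32 values that were delta-encoded and then serialized as Int8 bytes are recovered by undoing the layers from last to first. The column dict below is made up for illustration; real columns come out of the msgpack-decoded file in exactly this shape. Importing ``Bio.PDB.binary_cif`` assumes msgpack is installed and the ``_bcif_helper`` extension is built.

from Bio.PDB.binary_cif import _decode  # private helper, used here only to illustrate

# Original values [1, 2, 4, 7]: delta-encoded with origin=1 to [0, 1, 2, 3],
# then written out as Int8 bytes.
column = {
    "data": {
        "data": bytes([0, 1, 2, 3]),
        "encoding": [
            {"kind": "Delta", "origin": 1, "srcType": 3},  # applied first when encoding
            {"kind": "ByteArray", "type": 1},              # applied last, decoded first
        ],
    }
}
print(_decode(column))  # [1 2 4 7]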


@@ -62,6 +62,28 @@ Example: get the list of the :math:`y` coordinates of all atoms
     >>> y_list = mmcif_dict["_atom_site.Cartn_y"]

+Reading a BinaryCIF file
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Create a ``BinaryCIFParser`` object:
+
+.. doctest ../Tests/PDB lib:numpy lib:msgpack
+
+.. code:: pycon
+
+    >>> from Bio.PDB.binary_cif import BinaryCIFParser
+    >>> parser = BinaryCIFParser()
+
+Call ``get_structure`` with the path to the BinaryCIF file:
+
+.. cont-doctest ../Tests/PDB lib:numpy lib:msgpack
+
+.. code:: pycon
+
+    >>> parser.get_structure("1GBT", "1gbt.bcif.gz")
+    <Structure id=1GBT>
+
 Reading files in the MMTF format
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
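The object returned by ``get_structure`` is a regular ``Bio.PDB.Structure.Structure``, so the usual model/chain/residue/atom traversal applies. A short continuation of the tutorial example, assuming it is run from the ``Tests/PDB`` directory where ``1gbt.bcif.gz`` lives:

from Bio.PDB.binary_cif import BinaryCIFParser

parser = BinaryCIFParser()
structure = parser.get_structure("1GBT", "1gbt.bcif.gz")

for model in structure:
    for chain in model:
        # Chain IDs here come from label_asym_id; len(chain) is the residue count.
        print(chain.id, len(chain))
print(sum(1 for _ in structure.get_atoms()))  # total number of atoms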

NEWS.rst

@@ -68,6 +68,12 @@ A parser has been added for parsing PDBML (PDB XML) files.
 `PDBML <https://pdbml.wwpdb.org/>`_ is a representation of PDB data in XML format.
 The PDB chapter of the tutorial is updated to show how to use the PDBML parser.

+Additionally, a parser has been added for BinaryCIF files.
+BinaryCIF is a compact, binary representation of CIF data.
+The PDB tutorial is updated to show how to use the BinaryCIF parser.
+The RCSB PDB recommends that users switch from MMTF to BinaryCIF.
+See the `announcement <https://www.rcsb.org/news/feature/65a1af31c76ca3abcc925d0c>`_.
+
 Bio.PDB Structure objects will now issue a warning - instead of an exception - when
 two children (e.g. residues) have identical IDs. This can be useful in some
 cases, e.g. renumbering residues in a chain.

Tests/PDB/1gbt.bcif.gz (new binary file, not shown)

Tests/PDB/3jqh.bcif.gz (new binary file, not shown)

Tests/PDB/6wg6.bcif.gz (new binary file, not shown)


@@ -159,6 +159,14 @@ class SortingTests(unittest.TestCase):
             structure2.strictly_equals(structure)
         )  # Strict equality should be symmetric

+        # Modify an atom
+        structure2[0]["A"][(" ", 200, " ")]["CA"].name = "AC"
+        self.assertFalse(structure.strictly_equals(structure2))
+        self.assertFalse(
+            structure2.strictly_equals(structure)
+        )  # Strict equality should be symmetric
+
         # Remove a chain from a model in the structure
         structure2[0].detach_child("A")


@@ -0,0 +1,25 @@
"""
Tests for BinaryCIF code in the PDB package.
"""
import unittest
from Bio.PDB import MMCIFParser
from Bio.PDB.binary_cif import BinaryCIFParser
class TestBinaryCIFParser(unittest.TestCase):
def test_get_structure(self):
mmcif_parser = MMCIFParser(auth_chains=False)
bcif_parser = BinaryCIFParser()
for entry in ["1GBT", "6WG6", "3JQH"]:
mmcif_structure = mmcif_parser.get_structure(entry, f"PDB/{entry}.cif")
bcif_structure = bcif_parser.get_structure(
entry, f"PDB/{entry.lower()}.bcif.gz"
)
self.assertTrue(
mmcif_structure.strictly_equals(
bcif_structure, compare_coordinates=True
)
)

setup.py

@@ -193,8 +193,9 @@ EXTENSIONS = [
     Extension(
         "Bio.Cluster._cluster", ["Bio/Cluster/cluster.c", "Bio/Cluster/clustermodule.c"]
     ),
-    Extension("Bio.PDB.kdtrees", ["Bio/PDB/kdtrees.c"]),
     Extension("Bio.PDB.ccealign", ["Bio/PDB/ccealignmodule.c"]),
+    Extension("Bio.PDB.kdtrees", ["Bio/PDB/kdtrees.c"]),
+    Extension("Bio.PDB._bcif_helper", ["Bio/PDB/bcifhelpermodule.c"]),
     Extension("Bio.SeqIO._twoBitIO", ["Bio/SeqIO/_twoBitIO.c"]),
 ]