Fixes formatting for extremely high B-factor values in PDB files (#5047)

This commit is contained in:
Oliver Wissett
2025-09-03 15:05:07 +01:00
committed by GitHub
parent d1c4b3d0e2
commit d3de878863
5 changed files with 100 additions and 9 deletions

View File

@ -28,3 +28,7 @@ class PDBConstructionWarning(BiopythonWarning):
# The SMCRA structure could not be written to file # The SMCRA structure could not be written to file
class PDBIOException(Exception): class PDBIOException(Exception):
"""Define class PDBIOException.""" """Define class PDBIOException."""
class PDBIOWarning(BiopythonWarning):
"""Define class PDBIOWarning."""

View File

@ -8,14 +8,12 @@
import os import os
import warnings import warnings
from Bio import BiopythonWarning
from Bio.Data.IUPACData import atom_weights from Bio.Data.IUPACData import atom_weights
from Bio.PDB.PDBExceptions import PDBIOException from Bio.PDB.PDBExceptions import PDBIOException, PDBIOWarning
from Bio.PDB.StructureBuilder import StructureBuilder from Bio.PDB.StructureBuilder import StructureBuilder
_ATOM_FORMAT_STRING = ( _ATOM_FORMAT_STRING = "%s%5i %-4s%c%3s %c%4i%c %8.3f%8.3f%8.3f%s%s %4s%2s%2s\n"
"%s%5i %-4s%c%3s %c%4i%c %8.3f%8.3f%8.3f%s%6.2f %4s%2s%2s\n"
)
_PQR_ATOM_FORMAT_STRING = ( _PQR_ATOM_FORMAT_STRING = (
"%s%5i %-4s%c%3s %c%4i%c %8.3f%8.3f%8.3f %7s %6s %2s\n" "%s%5i %-4s%c%3s %c%4i%c %8.3f%8.3f%8.3f %7s %6s %2s\n"
) )
@ -24,6 +22,42 @@ _TER_FORMAT_STRING = (
"TER %5i %3s %c%4i%c \n" "TER %5i %3s %c%4i%c \n"
) )
_MAX_B_FACTOR_2DP = 1_000
_MAX_B_FACTOR_1DP = 10_000
_MAX_B_FACTOR = 999_999
def _format_b_factor(value: float) -> str:
"""Returns the b-factor value as a formatted string
Formats the b-factor value to fit the wwPDB specification of 6 characters
maximum, otherwise truncating to 6 characters if necessary.
"""
_bfactor_str = ""
if value < _MAX_B_FACTOR_2DP:
if len(f"{value:.2f}") > 6:
_bfactor_str = f"{value:6.1f}"
else:
_bfactor_str = f"{value:6.2f}"
elif value < _MAX_B_FACTOR_1DP:
if len(f"{value:.1f}") > 6:
_bfactor_str = f"{value:6.0f}"
else:
_bfactor_str = f"{value:6.1f}"
elif value < _MAX_B_FACTOR:
_bfactor_str = f"{int(value):6d}"
else:
warnings.warn(
f"Truncated bfactor {value!r} to {_MAX_B_FACTOR} to fit wwPDB spec."
" Consider using mmCIF instead!",
PDBIOWarning,
)
_bfactor_str = f"{_MAX_B_FACTOR:6d}"
assert len(_bfactor_str) <= 6
return _bfactor_str
class Select: class Select:
"""Select everything for PDB output (for use as a base class). """Select everything for PDB output (for use as a base class).
@ -214,13 +248,15 @@ class PDBIO(StructureIO):
occupancy = " " * 6 occupancy = " " * 6
warnings.warn( warnings.warn(
f"Missing occupancy in atom {atom.full_id!r} written as blank", f"Missing occupancy in atom {atom.full_id!r} written as blank",
BiopythonWarning, PDBIOWarning,
) )
else: else:
raise ValueError( raise ValueError(
f"Invalid occupancy value: {atom.occupancy!r}" f"Invalid occupancy value: {atom.occupancy!r}"
) from None ) from None
bfactor = _format_b_factor(bfactor)
args = ( args = (
record_type, record_type,
atom_number, atom_number,
@ -239,6 +275,7 @@ class PDBIO(StructureIO):
element, element,
charge, charge,
) )
return _ATOM_FORMAT_STRING % args return _ATOM_FORMAT_STRING % args
# Write PQR format line # Write PQR format line
@ -250,7 +287,7 @@ class PDBIO(StructureIO):
pqr_charge = " " * 7 pqr_charge = " " * 7
warnings.warn( warnings.warn(
f"Missing PQR charge in atom {atom.full_id} written as blank", f"Missing PQR charge in atom {atom.full_id} written as blank",
BiopythonWarning, PDBIOWarning,
) )
else: else:
raise ValueError( raise ValueError(
@ -264,7 +301,7 @@ class PDBIO(StructureIO):
radius = " " * 6 radius = " " * 6
warnings.warn( warnings.warn(
f"Missing radius in atom {atom.full_id} written as blank", f"Missing radius in atom {atom.full_id} written as blank",
BiopythonWarning, PDBIOWarning,
) )
else: else:
raise ValueError(f"Invalid radius value: {atom.radius}") from None raise ValueError(f"Invalid radius value: {atom.radius}") from None

View File

@ -270,6 +270,7 @@ please open an issue on GitHub or mention it on the mailing list.
- Nicolas Fontrodona <https://github.com/NFontrodona> - Nicolas Fontrodona <https://github.com/NFontrodona>
- Nigel Delaney <https://github.com/evolvedmicrobe> - Nigel Delaney <https://github.com/evolvedmicrobe>
- Noam Kremen <https://github.com/noamkremen> - Noam Kremen <https://github.com/noamkremen>
- Oliver Wissett <https://github.com/OWissett>
- Olivier Morelle <https://github.com/Oli4> - Olivier Morelle <https://github.com/Oli4>
- Oscar G. Garcia <https://github.com/oscarmaestre> - Oscar G. Garcia <https://github.com/oscarmaestre>
- Osvaldo Zagordi <https://github.com/ozagordi> - Osvaldo Zagordi <https://github.com/ozagordi>

View File

@ -20,9 +20,20 @@ has also been tested on PyPy3.10 v7.3.17.
`Infernal <http://eddylab.org/infernal/>` (v1.0.0+) RNA search tool. The `Infernal <http://eddylab.org/infernal/>` (v1.0.0+) RNA search tool. The
format are ``infernal-tab`` and ``infernal-text``. format are ``infernal-tab`` and ``infernal-text``.
``Bio.PDB.PDBIO`` now ensures that b-factor values are always at most 6 characters to
ensure that we do not violate the wwPDB specification. This should not have an impact
on the majority of uses, as b-factor values are generally small (less than 100). When
1000 \<= b-factor \< 10_000, the value is rounded to a single decimal place. When,
10_000 \<= b-factor \< 999_999, the value is rounded to zero decimal place. Values above
999_999 are now clamped. The justification for this is the rise in the b-factor field
being used for additional metadata, typically from computational tools.
``Bio.PDB.PDBIO`` will now raise module specific warnings: ``Bio.PDB.PDBExceptions.PDBIOWarning``.
Many thanks to the Biopython developers and community for making this release Many thanks to the Biopython developers and community for making this release
possible, especially the following contributors: possible, especially the following contributors:
- Oliver Wissett (first contribution)
- Samuel Prince (first contribution) - Samuel Prince (first contribution)
15 January 2025: Biopython 1.85 15 January 2025: Biopython 1.85

View File

@ -26,6 +26,8 @@ from Bio.PDB import Residue
from Bio.PDB import Select from Bio.PDB import Select
from Bio.PDB.PDBExceptions import PDBConstructionWarning from Bio.PDB.PDBExceptions import PDBConstructionWarning
from Bio.PDB.PDBExceptions import PDBIOException from Bio.PDB.PDBExceptions import PDBIOException
from Bio.PDB.PDBExceptions import PDBIOWarning
from Bio.PDB.PDBIO import _MAX_B_FACTOR
class WriteTest(unittest.TestCase): class WriteTest(unittest.TestCase):
@ -381,6 +383,42 @@ class WriteTest(unittest.TestCase):
self.io.save(filename) self.io.save(filename)
self.assertFalse(os.path.exists(filename)) self.assertFalse(os.path.exists(filename))
def test_pdbio_write_high_b_factor(self):
def check_pdb(filename, expected_bfactor):
struct = self.parser.get_structure("high_b", filename)
atom = next(struct.get_atoms())
self.assertEqual(atom.bfactor, expected_bfactor)
def test_b_factor(
b_factor: float, expected_bfactor: float, assert_warn: bool = False
) -> None:
struct = self.structure
atom = next(struct.get_atoms())
atom.bfactor = b_factor
self.io.set_structure(struct)
filenumber, filename = tempfile.mkstemp()
os.close(filenumber)
if assert_warn:
with self.assertWarns(PDBIOWarning):
self.io.save(filename)
else:
self.io.save(filename)
try:
check_pdb(filename, expected_bfactor)
finally:
os.remove(filename)
test_b_factor(99.999999, 100)
test_b_factor(999.99999, 1000)
test_b_factor(9999.9999, 10000)
test_b_factor(424.4242, 424.42)
test_b_factor(4242.4242, 4242.4)
test_b_factor(42424.4242, 42424)
test_b_factor(_MAX_B_FACTOR + 10, _MAX_B_FACTOR, assert_warn=True)
# Test revert_write # Test revert_write
def test_pdbio_revert_write_on_filename(self): def test_pdbio_revert_write_on_filename(self):
"""Test removing file when exception is caught (string).""" """Test removing file when exception is caught (string)."""