PQR Parser (#2338)

* Add a script for testing/linting changes before commiting

This script enables selective running of existing tests, performs lint
checks and a doctest check.
To be removed after finalizing the pull request.

* Modify Atom class to support radius and charge

* Enable Structure Builder to create Atom objects from PQR files

* Changes in PDBIO to read and write PQR files

* Update PDBIO.py

Whitespace and comment changes to comply with linter

* Update StructureBuilder.py

Previous version didn't properly initialize Atom instances when parsing
PQR files.

* Fix bug in PDBIO.py

Previous _PQR_ATOM_FORMAT_STRING had an error and as a result PDBIO
couldn't export .pqr files.

* Add PQRParser.py for parsing .pqr files.

* Add unit tests for PQR parser module.

* Include PQR tests in the testing shell script.

* Delete script for quick testing.

* Add names to the CONTRIB.rst

* Fix linting errors.

* Change implementation to a flag based one to avoid duplicate code.

Added the is_pqr flag to PDBParser.py and StructureBuilder.py. Now
_init_atom_from_pqr() and PQRParser class are no longer required to
parse .pqr files.

TODO: Completely remove them in a future commit.

* Modify PQR test to use the PDBParser class with the flag

TODO: rename the file to test_PQR since PQRParser will be deleted in the
future.

* Add test_PQR.sh shell script for testing modifications

Included the PDBParser.py and the unittest in files to lint.
TODO: Completely remove this script before submitting the changes to the
pull request.

* Added documentation and examples on reading and writing PQR files

* Added warning for None atom radius

Renamed "charge" to "pqr_charge" for consistency with Atom, StructureBuilder, PDBParser

* Renamed charge to pqr_charge

Renamed the variable "charge" to "pqr_charge" to avoid confusion.
Explicit mention of PQR file charge and radius fields in comments.

* Debug PDBIO.py

There was an issue in the PDBIO.py (before even this set of corrections)
that caused the PDBIO module to output incorrect .pdb files. The problem
was that the charge field in _get_atom_line() was overwritten in the
case of .pdb files with charge = None which caused the problem. This
issue is now fixed.

In addition, fixed another minor bug in PDBIO and a linting error in
Atom.py

* Delete unnecessary files

* Updated News.rst

*  Fix linting error in NEWS.rst

* Import StringIO from io instead of py3k.

Co-authored-by: Artemi Bendandi <37833668+artbendandi@users.noreply.github.com>
This commit is contained in:
Zisis Konstantinos
2020-01-13 13:32:17 +02:00
committed by mdehoon
parent 8917292afa
commit 8fd67cc501
9 changed files with 1869 additions and 82 deletions

View File

@ -24,6 +24,9 @@ class Atom:
coordinates, B factor, occupancy, alternative location specifier
and (optionally) anisotropic B factor and standard deviations of
B factor and positions.
In the case of PQR files, B factor and occupancy are replaced by
atomic charge and radius.
"""
def __init__(
@ -36,6 +39,8 @@ class Atom:
fullname,
serial_number,
element=None,
pqr_charge=None,
radius=None
):
"""Initialize Atom object.
@ -60,6 +65,12 @@ class Atom:
:param element: atom element, e.g. "C" for Carbon, "HG" for mercury,
:type element: uppercase string (or None if unknown)
:param pqr_charge: atom charge
:type pqr_charge: number
:param radius: atom radius
:type radius: number
"""
self.level = "A"
# Reference to the residue
@ -83,6 +94,8 @@ class Atom:
assert not element or element == element.upper(), element
self.element = self._assign_element(element)
self.mass = self._assign_atom_mass()
self.pqr_charge = pqr_charge
self.radius = radius
# For atom sorting (protein backbone atoms first)
self._sorting_keys = {"N": 0, "CA": 1, "C": 2, "O": 3}
@ -286,6 +299,14 @@ class Atom:
"""
self.anisou_array = anisou_array
def set_charge(self, pqr_charge):
"""Set charge."""
self.pqr_charge = pqr_charge
def set_radius(self, radius):
"""Set radius."""
self.radius = radius
# Public methods
def flag_disorder(self):
@ -373,6 +394,14 @@ class Atom:
"""Return level."""
return self.level
def get_charge(self):
"""Return charge."""
return self.pqr_charge
def get_radius(self):
"""Return radius."""
return self.radius
def transform(self, rot, tran):
"""Apply rotation and translation to the atomic coordinates.

View File

@ -16,6 +16,9 @@ from Bio.Data.IUPACData import atom_weights
_ATOM_FORMAT_STRING = (
"%s%5i %-4s%c%3s %c%4i%c %8.3f%8.3f%8.3f%s%6.2f %4s%2s%2s\n"
)
_PQR_ATOM_FORMAT_STRING = (
"%s%5i %-4s%c%3s %c%4i%c %8.3f%8.3f%8.3f %7s %6s %2s\n"
)
class Select:
@ -97,7 +100,7 @@ class StructureIO:
class PDBIO(StructureIO):
"""Write a Structure object (or a subset of a Structure object) as a PDB file.
"""Write a Structure object (or a subset of a Structure object) as a PDB or PQR file.
Examples
--------
@ -114,15 +117,18 @@ class PDBIO(StructureIO):
"""
def __init__(self, use_model_flag=0):
def __init__(self, use_model_flag=0, is_pqr=False):
"""Create the PDBIO object.
:param use_model_flag: if 1, force use of the MODEL record in output.
:type use_model_flag: int
:param is_pqr: if True, build PQR file. Otherwise build PDB file.
:type is_pqr: Boolean
"""
self.use_model_flag = use_model_flag
self.is_pqr = is_pqr
# private mathods
# private methods
def _get_atom_line(
self,
@ -160,45 +166,100 @@ class PDBIO(StructureIO):
altloc = atom.get_altloc()
x, y, z = atom.get_coord()
bfactor = atom.get_bfactor()
occupancy = atom.get_occupancy()
try:
occupancy_str = "%6.2f" % occupancy
except TypeError:
if occupancy is None:
occupancy_str = " " * 6
import warnings
from Bio import BiopythonWarning
warnings.warn(
"Missing occupancy in atom %s written as blank"
% repr(atom.get_full_id()),
BiopythonWarning,
)
else:
raise TypeError(
"Invalid occupancy %r in atom %r" % (occupancy, atom.get_full_id())
)
# PDB Arguments
if not self.is_pqr:
bfactor = atom.get_bfactor()
occupancy = atom.get_occupancy()
args = (
record_type,
atom_number,
name,
altloc,
resname,
chain_id,
resseq,
icode,
x,
y,
z,
occupancy_str,
bfactor,
segid,
element,
charge,
)
return _ATOM_FORMAT_STRING % args
# PQR Arguments
else:
radius = atom.get_radius()
pqr_charge = atom.get_charge()
if not self.is_pqr:
try:
occupancy_str = "%6.2f" % occupancy
except TypeError:
if occupancy is None:
occupancy_str = " " * 6
import warnings
from Bio import BiopythonWarning
warnings.warn(
"Missing occupancy in atom %s written as blank"
% repr(atom.get_full_id()),
BiopythonWarning,
)
else:
raise TypeError(
"Invalid occupancy %r in atom %r" % (occupancy, atom.get_full_id())
)
args = (
record_type,
atom_number,
name,
altloc,
resname,
chain_id,
resseq,
icode,
x,
y,
z,
occupancy_str,
bfactor,
segid,
element,
charge,
)
return _ATOM_FORMAT_STRING % args
else:
# PQR case
try:
pqr_charge = "%7.4f" % pqr_charge
except TypeError:
if pqr_charge is None:
pqr_charge = " " * 7
import warnings
from Bio import BiopythonWarning
warnings.warn("Missing charge in atom %s written as blank" %
repr(atom.get_full_id()), BiopythonWarning)
else:
raise TypeError("Invalid charge %r in atom %r"
% (pqr_charge, atom.get_full_id()))
try:
radius = "%6.4f" % radius
except TypeError:
if radius is None:
radius = " " * 6
import warnings
from Bio import BiopythonWarning
warnings.warn("Missing radius in atom %s written as blank" %
repr(atom.get_full_id()), BiopythonWarning)
else:
raise TypeError("Invalid radius %r in atom %r"
% (radius, atom.get_full_id()))
args = (
record_type,
atom_number,
name,
altloc,
resname,
chain_id,
resseq,
icode,
x,
y,
z,
pqr_charge,
radius,
element,
)
return _PQR_ATOM_FORMAT_STRING % args
# Public methods

View File

@ -33,7 +33,8 @@ class PDBParser:
"""Parse a PDB file and return a Structure object."""
def __init__(
self, PERMISSIVE=True, get_header=False, structure_builder=None, QUIET=False
self, PERMISSIVE=True, get_header=False, structure_builder=None, QUIET=False,
is_pqr=False
):
"""Create a PDBParser object.
@ -52,6 +53,9 @@ class PDBParser:
- QUIET - Evaluated as a Boolean. If true, warnings issued in constructing
the SMCRA data will be suppressed. If false (DEFAULT), they will be shown.
These warnings might be indicative of problems in the PDB file!
- is_pqr - Evaluated as a Boolean. Specifies the type of file to be parsed.
If false (DEFAULT) a .pdb file format is assumed. Set it to true if you
want to parse a .pqr file instead.
"""
# get_header is not used but is left in for API compatibility
@ -64,6 +68,7 @@ class PDBParser:
self.line_counter = 0
self.PERMISSIVE = bool(PERMISSIVE)
self.QUIET = bool(QUIET)
self.is_pqr = bool(is_pqr)
# Public methods
@ -192,30 +197,53 @@ class PDBParser:
% global_line_counter
)
coord = numpy.array((x, y, z), "f")
# occupancy & B factor
try:
occupancy = float(line[54:60])
except Exception:
self._handle_PDB_exception(
"Invalid or missing occupancy", global_line_counter
)
occupancy = None # Rather than arbitrary zero or one
if occupancy is not None and occupancy < 0:
# TODO - Should this be an error in strict mode?
# self._handle_PDB_exception("Negative occupancy",
# global_line_counter)
# This uses fixed text so the warning occurs once only:
warnings.warn(
"Negative occupancy in one or more atoms",
PDBConstructionWarning,
)
try:
bfactor = float(line[60:66])
except Exception:
self._handle_PDB_exception(
"Invalid or missing B factor", global_line_counter
)
bfactor = 0.0 # PDB uses a default of zero if missing
if not self.is_pqr:
try:
occupancy = float(line[54:60])
except Exception:
self._handle_PDB_exception(
"Invalid or missing occupancy", global_line_counter
)
occupancy = None # Rather than arbitrary zero or one
if occupancy is not None and occupancy < 0:
# TODO - Should this be an error in strict mode?
# self._handle_PDB_exception("Negative occupancy",
# global_line_counter)
# This uses fixed text so the warning occurs once only:
warnings.warn(
"Negative occupancy in one or more atoms",
PDBConstructionWarning,
)
try:
bfactor = float(line[60:66])
except Exception:
self._handle_PDB_exception(
"Invalid or missing B factor", global_line_counter
)
bfactor = 0.0 # PDB uses a default of zero if missing
elif self.is_pqr:
# Attempt to parse charge and radius fields
try:
pqr_charge = float(line[54:62])
except Exception:
self._handle_PDB_exception("Invalid or missing charge",
global_line_counter)
pqr_charge = None # Rather than arbitrary zero or one
try:
radius = float(line[62:70])
except Exception:
self._handle_PDB_exception("Invalid or missing radius",
global_line_counter)
radius = None
if radius is not None and radius < 0:
# In permissive mode raise fatal exception.
message = "Negative atom radius"
self._handle_PDB_exception(message, global_line_counter)
radius = None
segid = line[72:76]
element = line[76:78].strip().upper()
if current_segid != segid:
@ -241,20 +269,39 @@ class PDBParser:
)
except PDBConstructionException as message:
self._handle_PDB_exception(message, global_line_counter)
# init atom
try:
structure_builder.init_atom(
name,
coord,
bfactor,
occupancy,
altloc,
fullname,
serial_number,
element,
)
except PDBConstructionException as message:
self._handle_PDB_exception(message, global_line_counter)
if not self.is_pqr:
# init atom with pdb fiels
try:
structure_builder.init_atom(
name,
coord,
bfactor,
occupancy,
altloc,
fullname,
serial_number,
element,
)
except PDBConstructionException as message:
self._handle_PDB_exception(message, global_line_counter)
elif self.is_pqr:
try:
structure_builder.init_atom(
name,
coord,
pqr_charge,
radius,
altloc,
fullname,
serial_number,
element,
pqr_charge,
radius,
self.is_pqr
)
except PDBConstructionException as message:
self._handle_PDB_exception(message, global_line_counter)
elif record_type == "ANISOU":
anisou = [
float(x)

View File

@ -186,6 +186,9 @@ class StructureBuilder:
fullname,
serial_number=None,
element=None,
pqr_charge=None,
radius=None,
is_pqr=False
):
"""Create a new Atom object.
@ -197,6 +200,9 @@ class StructureBuilder:
- altloc - string, alternative location specifier
- fullname - string, atom name including spaces, e.g. " CA "
- element - string, upper case, e.g. "HG" for mercury
- pqr_charge - float, atom charge (PQR format)
- radius - float, atom radius (PQR format)
- is_pqr - boolean, flag to specify if a .pqr file is being parsed
"""
residue = self.residue
@ -221,9 +227,15 @@ class StructureBuilder:
% (duplicate_fullname, fullname, self.line_counter),
PDBConstructionWarning,
)
self.atom = Atom(
name, coord, b_factor, occupancy, altloc, fullname, serial_number, element
)
if not is_pqr:
self.atom = Atom(
name, coord, b_factor, occupancy, altloc, fullname, serial_number, element
)
elif is_pqr:
self.atom = Atom(
name, coord, None, None, altloc, fullname, serial_number,
element, pqr_charge, radius
)
if altloc != " ":
# The atom is disordered
if residue.has_id(name):

View File

@ -36,6 +36,7 @@ please open an issue on GitHub or mention it on the mailing list.
- Antony Lee <https://github.com/anntzer>
- Anuj Sharma <https://github.com/xulesc>
- Ariel Aptekmann <https://github.com/aralap>
- Artemi Bendandi <https://github.com/artbendandi>
- Barbara Mühlemann <https://github.com/bamueh>
- Bart de Koning <bratdaking gmail>
- Bartek Wilczynski <bartek at domain rezolwenta.eu.org>
@ -165,6 +166,7 @@ please open an issue on GitHub or mention it on the mailing list.
- Konrad Förstner <https://github.com/konrad>
- Konstantin Okonechnikov <k.okonechnikov at domain gmail.com>
- Konstantin Vdovkin <https://github.com/rtf_const>
- Konstantinos Zisis <https://github.com/zisikons>
- Kozo Nishida <https://github.com/kozo2>
- Kristian Davidsen <https://github.com/krdav>
- Kristian Rother <https://github.com/krother>

View File

@ -138,13 +138,33 @@ object, ie. directly from the PDB file:
...
\end{minted}
\subsection{Reading a PQR file}
In order to parse a PQR file, proceed in a similar manner as in the case
of PDB files:
Create a \texttt{PDBParser} object, using the \texttt{is_pqr} flag:
\begin{minted}{pycon}
>>> from Bio.PDB.PDBParser import PDBParser
>>> pqr_parser = PDBParser(PERMISSIVE=1, is_pqr=True)
\end{minted}
The \texttt{is_pqr} flag set to \texttt{True} indicates that the file to be parsed is a PQR file,
and that the parser should read the atomic charge and radius fields for each atom entry. Following the same procedure as for PQR files, a Structure object is then produced, and the PQR file is parsed.
\begin{minted}{pycon}
>>> structure_id = "1fat"
>>> filename = "pdb1fat.ent"
>>> structure = parser.get_structure(structure_id, filename, is_pqr=True)
\end{minted}
\subsection{Reading files in the PDB XML format}
That's not yet supported, but we are definitely planning to support that
in the future (it's not a lot of work). Contact the Biopython developers
via the mailing list if you need this.
\subsection{Writing mmCIF files}
The \texttt{MMCIFIO} class can be used to write structures to the mmCIF file format:
@ -206,6 +226,19 @@ If this is all too complicated for you, the \texttt{Dice} module contains
a handy \texttt{extract} function that writes out all residues in
a chain between a start and end residue.
\subsection{Writing PQR files}
Use the \texttt{PDBIO} class as you would for a PDB file, with the flag \texttt{is_pqr=True}.
The PDBIO methods can be used in the case of PQR files as well.
Example: writing a PQR file
\begin{minted}{pycon}
>>> io = PDBIO(is_pqr=True)
>>> io.set_structure(s)
>>> io.save("out.pdb")
\end{minted}
\subsection{Writing MMTF files}
To write structures to the MMTF file format:

View File

@ -52,6 +52,10 @@ either our original "Biopython License Agreement", or the very similar but
more commonly used "3-Clause BSD License". See the ``LICENSE.rst`` file for
more details.
``PDBParser`` and ``PDBIO`` now support PQR format file parsing and input/
output.
In addition to the mainstream ``x86_64`` aka ``AMD64`` CPU architecture, we
now also test every contribution on the ``ARM64``, ``ppc64le``, and ``s390x``
CPUs under Linux thanks to Travis CI. Further post-release testing done by
@ -73,6 +77,7 @@ possible, especially the following contributors:
- Andrey Raspopov
- Chris Daley (first contribution)
- Chris Rands
- Artemi Bendandi (first contribution)
- Christian Brueffer
- Deepak Khatri
- Ilya Flyamer (first contribution)
@ -80,7 +85,11 @@ possible, especially the following contributors:
- Michael R. Crusoe (first contribution)
- Michiel de Hoon
- Peter Cock
- Chris Daley (first contribution)
- Michiel de Hoon
- Jakub Lipinski (first contribution)
- Sergio Valqui
- Konstantinos Zisis (first contribution)
6 November 2019: Biopython 1.75
===============================

1423
Tests/PQR/1A80.pqr Normal file

File diff suppressed because it is too large Load Diff

171
Tests/test_PQR.py Normal file
View File

@ -0,0 +1,171 @@
# Copyright 2019-2020 by Konstantinos Zisis and Artemi Bendandi. All rights reserved.
#
# Converted by Konstantinos Zisis and Artemi Bendandi from an older unit test
# copyright 2009 by Eric Talevich.
#
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Unit tests for the PQR parser in Bio.PDB module."""
from __future__ import print_function
from Bio.PDB.PDBParser import PDBParser
import unittest
import warnings
from io import StringIO
from Bio.PDB.PDBExceptions import PDBConstructionException, PDBConstructionWarning
from Bio.PDB.PDBIO import PDBIO
import tempfile
import os
class ParseSimplePQR(unittest.TestCase):
"""Parse a simple PQR entry and check behaviour for various problematic inputs."""
def test_single_input(self):
"""Test if a single ATOM entry correctly parsed."""
# Atom Entry
data = "ATOM 1 N PRO 1 000001 02.000 3.0000 -0.1000 1.0000 N\n"
# Get sole atom of this structure
parser = PDBParser(is_pqr=True) # default initialization
struct = parser.get_structure("test", StringIO(data))
atom = next(struct.get_atoms())
# Check that charge and radius are properly initallized
self.assertEqual(atom.get_charge(), -0.1)
self.assertEqual(atom.get_radius(), 1.0)
self.assertEqual(atom.get_occupancy(), None)
self.assertEqual(atom.get_bfactor(), None)
# Coordinates
for i in range(1, 3):
self.assertEqual(atom.get_coord()[i], i + 1)
def test_bad_xyz(self):
"""Test if bad coordinates exception is raised."""
# Atom Entry
data = "ATOM 1 N PRO 1 00abc1 02.000 3.0000 -0.1000 1.0000 N\n"
# Get sole atom of this structure
parser = PDBParser(is_pqr=True) # default initialization
self.assertRaises(PDBConstructionException,
parser.get_structure, "example", StringIO(data))
def test_bad_charge(self):
"""Test if missing or malformed charge case is handled correctly."""
# Test Entries
malformed = "ATOM 1 N PRO 1 000001 02.000 3.0000 -0.W000 1.0000 N\n"
missing = "ATOM 1 N PRO 1 000001 02.000 3.0000 1.0000 N\n"
# Malformed
parser = PDBParser(PERMISSIVE=True, is_pqr=True) # default initialization
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always", PDBConstructionWarning)
structure = parser.get_structure("test", StringIO(malformed))
atom = next(structure.get_atoms())
self.assertEqual(atom.get_charge(), None)
# Missing
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always", PDBConstructionWarning)
structure = parser.get_structure("test", StringIO(missing))
atom = next(structure.get_atoms())
self.assertEqual(atom.get_charge(), None)
# Test PERMISSIVE mode behaviour
parser = PDBParser(PERMISSIVE=False, is_pqr=True) # default initialization
self.assertRaises(PDBConstructionException,
parser.get_structure, "example", StringIO(malformed))
def test_bad_radius(self):
"""Test if missing, malformed or negative radius case is handled correctly."""
# Test Entries
malformed = "ATOM 1 N PRO 1 000001 02.000 3.0000 -0.1000 1.a00f N\n"
missing = "ATOM 1 N PRO 1 000001 02.000 3.0000 -0.1000 N\n"
negative = "ATOM 1 N PRO 1 000001 02.000 3.0000 -0.1000 -1.0000 N\n"
# Malformed
parser = PDBParser(PERMISSIVE=True, is_pqr=True) # default initialization
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always", PDBConstructionWarning)
structure = parser.get_structure("test", StringIO(malformed))
atom = next(structure.get_atoms())
self.assertEqual(atom.get_radius(), None)
# Missing
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always", PDBConstructionWarning)
structure = parser.get_structure("test", StringIO(missing))
atom = next(structure.get_atoms())
self.assertEqual(atom.get_radius(), None)
# Negative
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always", PDBConstructionWarning)
structure = parser.get_structure("test", StringIO(negative))
atom = next(structure.get_atoms())
self.assertEqual(atom.get_radius(), None)
# Test PERMISSIVE mode behaviour
parser = PDBParser(PERMISSIVE=False, is_pqr=True) # default initialization
self.assertRaises(PDBConstructionException,
parser.get_structure, "example", StringIO(malformed))
self.assertRaises(PDBConstructionException,
parser.get_structure, "example", StringIO(negative))
self.assertRaises(PDBConstructionException,
parser.get_structure, "example", StringIO(missing))
class WriteTest(unittest.TestCase):
"""Test if the PDBIO module correctly exports .pqr files."""
def setUp(self):
with warnings.catch_warnings():
warnings.simplefilter("ignore", PDBConstructionWarning)
# Open a parser in permissive mode and parse an example file
self.pqr_parser = PDBParser(PERMISSIVE=1, is_pqr=True)
self.example_structure = self.pqr_parser.get_structure("example", "PQR/1A80.pqr")
def test_pdbio_write_pqr_structure(self):
"""Write a full structure using PDBIO."""
# Create a PDBIO object in pqr mode with example_structure as an argument
io = PDBIO(is_pqr=True)
io.set_structure(self.example_structure)
# Write to a temporary file
filenumber, filename = tempfile.mkstemp()
os.close(filenumber)
try:
# Export example_structure to a temp file
io.save(filename)
# Parse exported structure
output_struct = self.pqr_parser.get_structure("1a8o", filename)
# Comparisons
self.assertEqual(len(output_struct), len(self.example_structure)) # Structure Length
original_residues = len(list(self.example_structure.get_residues()))
parsed_residues = len(list(output_struct.get_residues()))
self.assertEqual(parsed_residues, original_residues) # Number of Residues
# Atom-wise comparison
original_atoms = self.example_structure.get_atoms()
for atom in output_struct.get_atoms():
self.assertEqual(atom, next(original_atoms))
finally:
os.remove(filename)
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=2)
unittest.main(testRunner=runner)