mirror of
https://github.com/biopython/biopython.git
synced 2025-10-21 06:23:48 +08:00
$ ruff check --fix --select=I \ --config=lint.isort.force-single-line=true \ --config=lint.isort.order-by-type=false \ BioSQL/ Bio/ Tests/ Scripts/ Doc/ setup.py Using ruff version 0.4.10
320 lines
10 KiB
Python
320 lines
10 KiB
Python
# Copyright 2001 by Gavin E. Crooks. All rights reserved.
|
|
# This file is part of the Biopython distribution and governed by your
|
|
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
|
# Please see the LICENSE file that should have been included as part of this
|
|
# package.
|
|
"""ASTRAL RAF (Rapid Access Format) Sequence Maps.
|
|
|
|
The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES
|
|
records (representing the sequence of the molecule used in an experiment) to
|
|
the ATOM records (representing the atoms experimentally observed).
|
|
|
|
This data is derived from the Protein Data Bank CIF files. Known errors in the
|
|
CIF files are corrected manually, with the original PDB file serving as the
|
|
final arbiter in case of discrepancies.
|
|
|
|
Residues are referenced by residue ID. This consists of a the PDB residue
|
|
sequence number (up to 4 digits) and an optional PDB insertion code (an
|
|
ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1"
|
|
|
|
See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html
|
|
|
|
Dictionary `protein_letters_3to1_extended` provides a mapping from the
|
|
3-letter amino acid codes found in PDB files to 1-letter codes. The 3-letter
|
|
codes include chemically modified residues.
|
|
"""
|
|
|
|
from copy import copy
|
|
|
|
from Bio.Data.PDBData import protein_letters_3to1_extended
|
|
from Bio.SCOP.Residues import Residues
|
|
|
|
|
|
def normalize_letters(one_letter_code):
|
|
"""Convert RAF one-letter amino acid codes into IUPAC standard codes.
|
|
|
|
Letters are uppercased, and "." ("Unknown") is converted to "X".
|
|
"""
|
|
if one_letter_code == ".":
|
|
return "X"
|
|
else:
|
|
return one_letter_code.upper()
|
|
|
|
|
|
class SeqMapIndex(dict):
|
|
"""An RAF file index.
|
|
|
|
The RAF file itself is about 50 MB. This index provides rapid, random
|
|
access of RAF records without having to load the entire file into memory.
|
|
|
|
The index key is a concatenation of the PDB ID and chain ID. e.g
|
|
"2drcA", ``"155c_"``. RAF uses an underscore to indicate blank
|
|
chain IDs.
|
|
"""
|
|
|
|
def __init__(self, filename):
|
|
"""Initialize the RAF file index.
|
|
|
|
Arguments:
|
|
- filename -- The file to index
|
|
|
|
"""
|
|
dict.__init__(self)
|
|
self.filename = filename
|
|
|
|
with open(self.filename) as f:
|
|
position = 0
|
|
while True:
|
|
line = f.readline()
|
|
if not line:
|
|
break
|
|
key = line[0:5]
|
|
if key is not None:
|
|
self[key] = position
|
|
position = f.tell()
|
|
|
|
def __getitem__(self, key):
|
|
"""Return an item from the indexed file."""
|
|
position = dict.__getitem__(self, key)
|
|
|
|
with open(self.filename) as f:
|
|
f.seek(position)
|
|
line = f.readline()
|
|
record = SeqMap(line)
|
|
return record
|
|
|
|
def getSeqMap(self, residues):
|
|
"""Get the sequence map for a collection of residues.
|
|
|
|
Arguments:
|
|
- residues -- A Residues instance, or a string that can be
|
|
converted into a Residues instance.
|
|
|
|
"""
|
|
if isinstance(residues, str):
|
|
residues = Residues(residues)
|
|
|
|
pdbid = residues.pdbid
|
|
frags = residues.fragments
|
|
if not frags:
|
|
frags = (("_", "", ""),) # All residues of unnamed chain
|
|
|
|
seqMap = None
|
|
for frag in frags:
|
|
chainid = frag[0]
|
|
if chainid in ["", "-", " ", "_"]:
|
|
chainid = "_"
|
|
id = pdbid + chainid
|
|
|
|
sm = self[id]
|
|
|
|
# Cut out fragment of interest
|
|
start = 0
|
|
end = len(sm.res)
|
|
if frag[1]:
|
|
start = int(sm.index(frag[1], chainid))
|
|
if frag[2]:
|
|
end = int(sm.index(frag[2], chainid)) + 1
|
|
|
|
sm = sm[start:end]
|
|
|
|
if seqMap is None:
|
|
seqMap = sm
|
|
else:
|
|
seqMap += sm
|
|
|
|
return seqMap
|
|
|
|
|
|
class SeqMap:
|
|
"""An ASTRAL RAF (Rapid Access Format) Sequence Map.
|
|
|
|
This is a list like object; You can find the location of particular residues
|
|
with index(), slice this SeqMap into fragments, and glue fragments back
|
|
together with extend().
|
|
|
|
Attributes:
|
|
- pdbid -- The PDB 4 character ID
|
|
- pdb_datestamp -- From the PDB file
|
|
- version -- The RAF format version. e.g. 0.01
|
|
- flags -- RAF flags. (See release notes for more information.)
|
|
- res -- A list of Res objects, one for each residue in this sequence map
|
|
|
|
"""
|
|
|
|
def __init__(self, line=None):
|
|
"""Initialize the class."""
|
|
self.pdbid = ""
|
|
self.pdb_datestamp = ""
|
|
self.version = ""
|
|
self.flags = ""
|
|
self.res = []
|
|
if line:
|
|
self._process(line)
|
|
|
|
def _process(self, line):
|
|
"""Parse a RAF record into a SeqMap object (PRIVATE)."""
|
|
header_len = 38
|
|
|
|
line = line.rstrip() # no trailing whitespace
|
|
|
|
if len(line) < header_len:
|
|
raise ValueError("Incomplete header: " + line)
|
|
|
|
self.pdbid = line[0:4]
|
|
chainid = line[4:5]
|
|
|
|
self.version = line[6:10]
|
|
|
|
# Raf format versions 0.01 and 0.02 are identical for practical purposes
|
|
if self.version != "0.01" and self.version != "0.02":
|
|
raise ValueError("Incompatible RAF version: " + self.version)
|
|
|
|
self.pdb_datestamp = line[14:20]
|
|
self.flags = line[21:27]
|
|
|
|
for i in range(header_len, len(line), 7):
|
|
f = line[i : i + 7]
|
|
if len(f) != 7:
|
|
raise ValueError("Corrupt Field: (" + f + ")")
|
|
r = Res()
|
|
r.chainid = chainid
|
|
r.resid = f[0:5].strip()
|
|
r.atom = normalize_letters(f[5:6])
|
|
r.seqres = normalize_letters(f[6:7])
|
|
|
|
self.res.append(r)
|
|
|
|
def index(self, resid, chainid="_"):
|
|
"""Return the index of the SeqMap for the given resid and chainid."""
|
|
for i in range(len(self.res)):
|
|
if self.res[i].resid == resid and self.res[i].chainid == chainid:
|
|
return i
|
|
raise KeyError("No such residue " + chainid + resid)
|
|
|
|
def __getitem__(self, index):
|
|
"""Extract a single Res object from the SeqMap."""
|
|
if not isinstance(index, slice):
|
|
raise NotImplementedError
|
|
s = copy(self)
|
|
s.res = s.res[index]
|
|
return s
|
|
|
|
def append(self, res):
|
|
"""Append another Res object onto the list of residue mappings."""
|
|
self.res.append(res)
|
|
|
|
def extend(self, other):
|
|
"""Append another SeqMap onto the end of self.
|
|
|
|
Both SeqMaps must have the same PDB ID, PDB datestamp and
|
|
RAF version. The RAF flags are erased if they are inconsistent. This
|
|
may happen when fragments are taken from different chains.
|
|
"""
|
|
if not isinstance(other, SeqMap):
|
|
raise TypeError("Can only extend a SeqMap with a SeqMap.")
|
|
if self.pdbid != other.pdbid:
|
|
raise TypeError("Cannot add fragments from different proteins")
|
|
if self.version != other.version:
|
|
raise TypeError("Incompatible rafs")
|
|
if self.pdb_datestamp != other.pdb_datestamp:
|
|
raise TypeError("Different pdb dates!")
|
|
if self.flags != other.flags:
|
|
self.flags = ""
|
|
self.res += other.res
|
|
|
|
def __iadd__(self, other):
|
|
"""In place addition of SeqMap objects."""
|
|
self.extend(other)
|
|
return self
|
|
|
|
def __add__(self, other):
|
|
"""Addition of SeqMap objects."""
|
|
s = copy(self)
|
|
s.extend(other)
|
|
return s
|
|
|
|
def getAtoms(self, pdb_handle, out_handle):
|
|
"""Extract all relevant ATOM and HETATOM records from a PDB file.
|
|
|
|
The PDB file is scanned for ATOM and HETATOM records. If the
|
|
chain ID, residue ID (seqNum and iCode), and residue type match
|
|
a residue in this sequence map, then the record is echoed to the
|
|
output handle.
|
|
|
|
This is typically used to find the coordinates of a domain, or other
|
|
residue subset.
|
|
|
|
Arguments:
|
|
- pdb_handle -- A handle to the relevant PDB file.
|
|
- out_handle -- All output is written to this file like object.
|
|
|
|
"""
|
|
# This code should be refactored when (if?) biopython gets a PDB parser
|
|
|
|
# The set of residues that I have to find records for.
|
|
resSet = {}
|
|
for r in self.res:
|
|
if r.atom == "X": # Unknown residue type
|
|
continue
|
|
chainid = r.chainid
|
|
if chainid == "_":
|
|
chainid = " "
|
|
resid = r.resid
|
|
resSet[(chainid, resid)] = r
|
|
|
|
resFound = {}
|
|
for line in pdb_handle:
|
|
if line.startswith(("ATOM ", "HETATM")):
|
|
chainid = line[21:22]
|
|
resid = line[22:27].strip()
|
|
key = (chainid, resid)
|
|
if key in resSet:
|
|
res = resSet[key]
|
|
atom_aa = res.atom
|
|
resName = line[17:20]
|
|
if resName in protein_letters_3to1_extended:
|
|
if protein_letters_3to1_extended[resName] == atom_aa:
|
|
out_handle.write(line)
|
|
resFound[key] = res
|
|
|
|
if len(resSet) != len(resFound):
|
|
# for k in resFound:
|
|
# del resSet[k]
|
|
# print(resSet)
|
|
raise RuntimeError(
|
|
"Could not find at least one ATOM or HETATM"
|
|
" record for each and every residue in this"
|
|
" sequence map."
|
|
)
|
|
|
|
|
|
class Res:
|
|
"""A single residue mapping from a RAF record.
|
|
|
|
Attributes:
|
|
- chainid -- A single character chain ID.
|
|
- resid -- The residue ID.
|
|
- atom -- amino acid one-letter code from ATOM records.
|
|
- seqres -- amino acid one-letter code from SEQRES records.
|
|
|
|
"""
|
|
|
|
def __init__(self):
|
|
"""Initialize the class."""
|
|
self.chainid = ""
|
|
self.resid = ""
|
|
self.atom = ""
|
|
self.seqres = ""
|
|
|
|
|
|
def parse(handle):
|
|
"""Iterate over RAF file, giving a SeqMap object for each line.
|
|
|
|
Arguments:
|
|
- handle -- file-like object.
|
|
|
|
"""
|
|
for line in handle:
|
|
yield SeqMap(line)
|