mirror of
https://github.com/google-deepmind/alphafold3.git
synced 2025-10-20 13:23:47 +08:00
Add support for protein/DNA/RNA/ligand descriptions
Suggested in https://github.com/google-deepmind/alphafold3/issues/496. PiperOrigin-RevId: 802081348 Change-Id: I666466fd6a770b6f4a891ed33e6a26651d600c4a
This commit is contained in:
committed by
Copybara-Service
parent
a2b03dab51
commit
7b816f4035
@ -117,7 +117,7 @@ The top-level structure of the input JSON is:
|
|||||||
"userCCD": "...", # Optional, mutually exclusive with userCCDPath.
|
"userCCD": "...", # Optional, mutually exclusive with userCCDPath.
|
||||||
"userCCDPath": "...", # Optional, mutually exclusive with userCCD.
|
"userCCDPath": "...", # Optional, mutually exclusive with userCCD.
|
||||||
"dialect": "alphafold3", # Required.
|
"dialect": "alphafold3", # Required.
|
||||||
"version": 3 # Required.
|
"version": 4 # Required.
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -166,6 +166,8 @@ The top-level `version` field (for the `alphafold3` dialect) can be either `1`,
|
|||||||
added fields `unpairedMsaPath`, `pairedMsaPath`, and `mmcifPath`.
|
added fields `unpairedMsaPath`, `pairedMsaPath`, and `mmcifPath`.
|
||||||
* `3`: added the option of specifying external user-provided CCD using newly
|
* `3`: added the option of specifying external user-provided CCD using newly
|
||||||
added field `userCCDPath`.
|
added field `userCCDPath`.
|
||||||
|
* `4`: added the option of specifying textual `description` of protein chains,
|
||||||
|
RNA chains, DNA chains, or ligands.
|
||||||
|
|
||||||
## Sequences
|
## Sequences
|
||||||
|
|
||||||
@ -186,6 +188,7 @@ Specifies a single protein chain.
|
|||||||
{"ptmType": "HY3", "ptmPosition": 1},
|
{"ptmType": "HY3", "ptmPosition": 1},
|
||||||
{"ptmType": "P1L", "ptmPosition": 5}
|
{"ptmType": "P1L", "ptmPosition": 5}
|
||||||
],
|
],
|
||||||
|
"description": ..., # Optional.
|
||||||
"unpairedMsa": ..., # Mutually exclusive with unpairedMsaPath.
|
"unpairedMsa": ..., # Mutually exclusive with unpairedMsaPath.
|
||||||
"unpairedMsaPath": ..., # Mutually exclusive with unpairedMsa.
|
"unpairedMsaPath": ..., # Mutually exclusive with unpairedMsa.
|
||||||
"pairedMsa": ..., # Mutually exclusive with pairedMsaPath.
|
"pairedMsa": ..., # Mutually exclusive with pairedMsaPath.
|
||||||
@ -207,6 +210,9 @@ The fields specify the following:
|
|||||||
post-translational modifications. Each modification is specified using its
|
post-translational modifications. Each modification is specified using its
|
||||||
CCD code and 1-based residue position. In the example above, we see that the
|
CCD code and 1-based residue position. In the example above, we see that the
|
||||||
first residue won't be a proline (`P`) but instead `HY3`.
|
first residue won't be a proline (`P`) but instead `HY3`.
|
||||||
|
* `description: str`: An optional textual description of this chain. This
|
||||||
|
field is only used in the JSON format and serves as a comment
|
||||||
|
describing this chain.
|
||||||
* `unpairedMsa: str`: An optional multiple sequence alignment for this chain.
|
* `unpairedMsa: str`: An optional multiple sequence alignment for this chain.
|
||||||
This is specified using the A3M format (equivalent to the FASTA format, but
|
This is specified using the A3M format (equivalent to the FASTA format, but
|
||||||
also allows gaps denoted by the hyphen `-` character). See more details
|
also allows gaps denoted by the hyphen `-` character). See more details
|
||||||
@ -239,6 +245,7 @@ Specifies a single RNA chain.
|
|||||||
{"modificationType": "2MG", "basePosition": 1},
|
{"modificationType": "2MG", "basePosition": 1},
|
||||||
{"modificationType": "5MC", "basePosition": 4}
|
{"modificationType": "5MC", "basePosition": 4}
|
||||||
],
|
],
|
||||||
|
"description": ..., # Optional.
|
||||||
"unpairedMsa": ..., # Mutually exclusive with unpairedMsaPath.
|
"unpairedMsa": ..., # Mutually exclusive with unpairedMsaPath.
|
||||||
"unpairedMsaPath": ... # Mutually exclusive with unpairedMsa.
|
"unpairedMsaPath": ... # Mutually exclusive with unpairedMsa.
|
||||||
}
|
}
|
||||||
@ -255,6 +262,9 @@ The fields specify the following:
|
|||||||
letters `A`, `C`, `G`, `U`.
|
letters `A`, `C`, `G`, `U`.
|
||||||
* `modifications: list[RnaModification]`: An optional list of modifications.
|
* `modifications: list[RnaModification]`: An optional list of modifications.
|
||||||
Each modification is specified using its CCD code and 1-based base position.
|
Each modification is specified using its CCD code and 1-based base position.
|
||||||
|
* `description: str`: An optional textual description of this chain. This
|
||||||
|
field is only used in the JSON format and serves as a comment
|
||||||
|
describing this chain.
|
||||||
* `unpairedMsa: str`: An optional multiple sequence alignment for this chain.
|
* `unpairedMsa: str`: An optional multiple sequence alignment for this chain.
|
||||||
This is specified using the A3M format. See more details below.
|
This is specified using the A3M format. See more details below.
|
||||||
* `unpairedMsaPath: str`: An optional path to a file that contains the
|
* `unpairedMsaPath: str`: An optional path to a file that contains the
|
||||||
@ -275,7 +285,8 @@ Specifies a single DNA chain.
|
|||||||
"modifications": [
|
"modifications": [
|
||||||
{"modificationType": "6OG", "basePosition": 1},
|
{"modificationType": "6OG", "basePosition": 1},
|
||||||
{"modificationType": "6MA", "basePosition": 2}
|
{"modificationType": "6MA", "basePosition": 2}
|
||||||
]
|
],
|
||||||
|
"description": ... # Optional.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
@ -290,6 +301,9 @@ The fields specify the following:
|
|||||||
letters `A`, `C`, `G`, `T`.
|
letters `A`, `C`, `G`, `T`.
|
||||||
* `modifications: list[DnaModification]`: An optional list of modifications.
|
* `modifications: list[DnaModification]`: An optional list of modifications.
|
||||||
Each modification is specified using its CCD code and 1-based base position.
|
Each modification is specified using its CCD code and 1-based base position.
|
||||||
|
* `description: str`: An optional textual description of this chain. This
|
||||||
|
field is only used in the JSON format and serves as a comment
|
||||||
|
describing this chain.
|
||||||
|
|
||||||
### Ligands
|
### Ligands
|
||||||
|
|
||||||
@ -314,19 +328,22 @@ Specifies a single ligand. Ligands can be specified using 3 different formats:
|
|||||||
{
|
{
|
||||||
"ligand": {
|
"ligand": {
|
||||||
"id": ["G", "H", "I"],
|
"id": ["G", "H", "I"],
|
||||||
"ccdCodes": ["ATP"]
|
"ccdCodes": ["ATP"],
|
||||||
|
"description": ... # Optional.
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"ligand": {
|
"ligand": {
|
||||||
"id": "J",
|
"id": "J",
|
||||||
"ccdCodes": ["LIG-1337"]
|
"ccdCodes": ["LIG-1337"],
|
||||||
|
"description": ... # Optional.
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"ligand": {
|
"ligand": {
|
||||||
"id": "K",
|
"id": "K",
|
||||||
"smiles": "CC(=O)OC1C[NH+]2CCC1CC2"
|
"smiles": "CC(=O)OC1C[NH+]2CCC1CC2",
|
||||||
|
"description": ... # Optional.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
@ -342,6 +359,9 @@ The fields specify the following:
|
|||||||
[user-provided CCD](#user-provided-ccd).
|
[user-provided CCD](#user-provided-ccd).
|
||||||
* `smiles: str`: An optional string defining the ligand using a SMILES string.
|
* `smiles: str`: An optional string defining the ligand using a SMILES string.
|
||||||
The SMILES string must be correctly JSON-escaped.
|
The SMILES string must be correctly JSON-escaped.
|
||||||
|
* `description: str`: An optional textual description of this ligand. This
|
||||||
|
field is only used in the JSON format and serves as a comment
|
||||||
|
describing this ligand.
|
||||||
|
|
||||||
Each ligand may be specified using CCD codes or SMILES but not both, i.e. for a
|
Each ligand may be specified using CCD codes or SMILES but not both, i.e. for a
|
||||||
given ligand, the `ccdCodes` and `smiles` fields are mutually exclusive.
|
given ligand, the `ccdCodes` and `smiles` fields are mutually exclusive.
|
||||||
@ -919,6 +939,7 @@ certain fields and the sequences are not biologically meaningful.
|
|||||||
{"ptmType": "HY3", "ptmPosition": 1},
|
{"ptmType": "HY3", "ptmPosition": 1},
|
||||||
{"ptmType": "P1L", "ptmPosition": 5}
|
{"ptmType": "P1L", "ptmPosition": 5}
|
||||||
],
|
],
|
||||||
|
"description": "10-residue protein with 2 modifications",
|
||||||
"unpairedMsa": ...,
|
"unpairedMsa": ...,
|
||||||
"pairedMsa": ""
|
"pairedMsa": ""
|
||||||
}
|
}
|
||||||
@ -982,7 +1003,6 @@ certain fields and the sequences are not biologically meaningful.
|
|||||||
],
|
],
|
||||||
"userCCD": ...,
|
"userCCD": ...,
|
||||||
"dialect": "alphafold3",
|
"dialect": "alphafold3",
|
||||||
"version": 3
|
"version": 4
|
||||||
}
|
}
|
||||||
|
|
||||||
```
|
```
|
||||||
|
@ -36,7 +36,7 @@ import zstandard as zstd
|
|||||||
BondAtomId: TypeAlias = tuple[str, int, str]
|
BondAtomId: TypeAlias = tuple[str, int, str]
|
||||||
|
|
||||||
JSON_DIALECT: Final[str] = 'alphafold3'
|
JSON_DIALECT: Final[str] = 'alphafold3'
|
||||||
JSON_VERSIONS: Final[tuple[int, ...]] = (1, 2, 3)
|
JSON_VERSIONS: Final[tuple[int, ...]] = (1, 2, 3, 4)
|
||||||
JSON_VERSION: Final[int] = JSON_VERSIONS[-1]
|
JSON_VERSION: Final[int] = JSON_VERSIONS[-1]
|
||||||
|
|
||||||
ALPHAFOLDSERVER_JSON_DIALECT: Final[str] = 'alphafoldserver'
|
ALPHAFOLDSERVER_JSON_DIALECT: Final[str] = 'alphafoldserver'
|
||||||
@ -127,6 +127,7 @@ class ProteinChain:
|
|||||||
'_id',
|
'_id',
|
||||||
'_sequence',
|
'_sequence',
|
||||||
'_ptms',
|
'_ptms',
|
||||||
|
'_description',
|
||||||
'_paired_msa',
|
'_paired_msa',
|
||||||
'_unpaired_msa',
|
'_unpaired_msa',
|
||||||
'_templates',
|
'_templates',
|
||||||
@ -138,6 +139,7 @@ class ProteinChain:
|
|||||||
id: str, # pylint: disable=redefined-builtin
|
id: str, # pylint: disable=redefined-builtin
|
||||||
sequence: str,
|
sequence: str,
|
||||||
ptms: Sequence[tuple[str, int]],
|
ptms: Sequence[tuple[str, int]],
|
||||||
|
description: str | None = None,
|
||||||
paired_msa: str | None = None,
|
paired_msa: str | None = None,
|
||||||
unpaired_msa: str | None = None,
|
unpaired_msa: str | None = None,
|
||||||
templates: Sequence[Template] | None = None,
|
templates: Sequence[Template] | None = None,
|
||||||
@ -149,6 +151,7 @@ class ProteinChain:
|
|||||||
sequence: The amino acid sequence of the chain.
|
sequence: The amino acid sequence of the chain.
|
||||||
ptms: A list of tuples containing the post-translational modification type
|
ptms: A list of tuples containing the post-translational modification type
|
||||||
and the (1-based) residue index where the modification is applied.
|
and the (1-based) residue index where the modification is applied.
|
||||||
|
description: An optional textual description of the protein chain.
|
||||||
paired_msa: Paired A3M-formatted MSA for this chain. This MSA is not
|
paired_msa: Paired A3M-formatted MSA for this chain. This MSA is not
|
||||||
deduplicated and will be used to compute paired features. If None, this
|
deduplicated and will be used to compute paired features. If None, this
|
||||||
field is unset and must be filled in by the data pipeline before
|
field is unset and must be filled in by the data pipeline before
|
||||||
@ -175,6 +178,7 @@ class ProteinChain:
|
|||||||
self._id = id
|
self._id = id
|
||||||
self._sequence = sequence
|
self._sequence = sequence
|
||||||
self._ptms = tuple(ptms)
|
self._ptms = tuple(ptms)
|
||||||
|
self._description = description
|
||||||
self._paired_msa = paired_msa
|
self._paired_msa = paired_msa
|
||||||
self._unpaired_msa = unpaired_msa
|
self._unpaired_msa = unpaired_msa
|
||||||
self._templates = tuple(templates) if templates is not None else None
|
self._templates = tuple(templates) if templates is not None else None
|
||||||
@ -198,6 +202,10 @@ class ProteinChain:
|
|||||||
def ptms(self) -> Sequence[tuple[str, int]]:
|
def ptms(self) -> Sequence[tuple[str, int]]:
|
||||||
return self._ptms
|
return self._ptms
|
||||||
|
|
||||||
|
@property
|
||||||
|
def description(self) -> str | None:
|
||||||
|
return self._description
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def paired_msa(self) -> str | None:
|
def paired_msa(self) -> str | None:
|
||||||
return self._paired_msa
|
return self._paired_msa
|
||||||
@ -218,6 +226,7 @@ class ProteinChain:
|
|||||||
self._id == other._id
|
self._id == other._id
|
||||||
and self._sequence == other._sequence
|
and self._sequence == other._sequence
|
||||||
and self._ptms == other._ptms
|
and self._ptms == other._ptms
|
||||||
|
and self._description == other._description
|
||||||
and self._paired_msa == other._paired_msa
|
and self._paired_msa == other._paired_msa
|
||||||
and self._unpaired_msa == other._unpaired_msa
|
and self._unpaired_msa == other._unpaired_msa
|
||||||
and self._templates == other._templates
|
and self._templates == other._templates
|
||||||
@ -228,6 +237,7 @@ class ProteinChain:
|
|||||||
self._id,
|
self._id,
|
||||||
self._sequence,
|
self._sequence,
|
||||||
self._ptms,
|
self._ptms,
|
||||||
|
self._description,
|
||||||
self._paired_msa,
|
self._paired_msa,
|
||||||
self._unpaired_msa,
|
self._unpaired_msa,
|
||||||
self._templates,
|
self._templates,
|
||||||
@ -238,6 +248,7 @@ class ProteinChain:
|
|||||||
return hash((
|
return hash((
|
||||||
self._sequence,
|
self._sequence,
|
||||||
self._ptms,
|
self._ptms,
|
||||||
|
self._description,
|
||||||
self._paired_msa,
|
self._paired_msa,
|
||||||
self._unpaired_msa,
|
self._unpaired_msa,
|
||||||
self._templates,
|
self._templates,
|
||||||
@ -298,6 +309,7 @@ class ProteinChain:
|
|||||||
'id',
|
'id',
|
||||||
'sequence',
|
'sequence',
|
||||||
'modifications',
|
'modifications',
|
||||||
|
'description',
|
||||||
'unpairedMsa',
|
'unpairedMsa',
|
||||||
'unpairedMsaPath',
|
'unpairedMsaPath',
|
||||||
'pairedMsa',
|
'pairedMsa',
|
||||||
@ -368,6 +380,7 @@ class ProteinChain:
|
|||||||
id=seq_id or json_dict['id'],
|
id=seq_id or json_dict['id'],
|
||||||
sequence=sequence,
|
sequence=sequence,
|
||||||
ptms=ptms,
|
ptms=ptms,
|
||||||
|
description=json_dict.get('description', None),
|
||||||
paired_msa=paired_msa,
|
paired_msa=paired_msa,
|
||||||
unpaired_msa=unpaired_msa,
|
unpaired_msa=unpaired_msa,
|
||||||
templates=templates,
|
templates=templates,
|
||||||
@ -400,6 +413,8 @@ class ProteinChain:
|
|||||||
'pairedMsa': self._paired_msa,
|
'pairedMsa': self._paired_msa,
|
||||||
'templates': templates,
|
'templates': templates,
|
||||||
}
|
}
|
||||||
|
if self._description is not None:
|
||||||
|
contents['description'] = self._description
|
||||||
return {'protein': contents}
|
return {'protein': contents}
|
||||||
|
|
||||||
def to_ccd_sequence(self) -> Sequence[str]:
|
def to_ccd_sequence(self) -> Sequence[str]:
|
||||||
@ -418,6 +433,7 @@ class ProteinChain:
|
|||||||
id=self.id,
|
id=self.id,
|
||||||
sequence=self._sequence,
|
sequence=self._sequence,
|
||||||
ptms=self._ptms,
|
ptms=self._ptms,
|
||||||
|
description=self._description,
|
||||||
unpaired_msa=self._unpaired_msa or '',
|
unpaired_msa=self._unpaired_msa or '',
|
||||||
paired_msa=self._paired_msa or '',
|
paired_msa=self._paired_msa or '',
|
||||||
templates=self._templates or [],
|
templates=self._templates or [],
|
||||||
@ -427,7 +443,13 @@ class ProteinChain:
|
|||||||
class RnaChain:
|
class RnaChain:
|
||||||
"""RNA chain input."""
|
"""RNA chain input."""
|
||||||
|
|
||||||
__slots__ = ('_id', '_sequence', '_modifications', '_unpaired_msa')
|
__slots__ = (
|
||||||
|
'_id',
|
||||||
|
'_sequence',
|
||||||
|
'_modifications',
|
||||||
|
'_description',
|
||||||
|
'_unpaired_msa',
|
||||||
|
)
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -435,6 +457,7 @@ class RnaChain:
|
|||||||
id: str, # pylint: disable=redefined-builtin
|
id: str, # pylint: disable=redefined-builtin
|
||||||
sequence: str,
|
sequence: str,
|
||||||
modifications: Sequence[tuple[str, int]],
|
modifications: Sequence[tuple[str, int]],
|
||||||
|
description: str | None = None,
|
||||||
unpaired_msa: str | None = None,
|
unpaired_msa: str | None = None,
|
||||||
):
|
):
|
||||||
"""Initializes a single strand RNA chain input.
|
"""Initializes a single strand RNA chain input.
|
||||||
@ -444,6 +467,7 @@ class RnaChain:
|
|||||||
sequence: The RNA sequence of the chain.
|
sequence: The RNA sequence of the chain.
|
||||||
modifications: A list of tuples containing the modification type and the
|
modifications: A list of tuples containing the modification type and the
|
||||||
(1-based) residue index where the modification is applied.
|
(1-based) residue index where the modification is applied.
|
||||||
|
description: An optional textual description of the RNA chain.
|
||||||
unpaired_msa: Unpaired A3M-formatted MSA for this chain. This will be
|
unpaired_msa: Unpaired A3M-formatted MSA for this chain. This will be
|
||||||
deduplicated and used to compute unpaired features. If None, this field
|
deduplicated and used to compute unpaired features. If None, this field
|
||||||
is unset and must be filled in by the data pipeline before
|
is unset and must be filled in by the data pipeline before
|
||||||
@ -463,6 +487,7 @@ class RnaChain:
|
|||||||
self._sequence = sequence
|
self._sequence = sequence
|
||||||
# Use hashable container for modifications.
|
# Use hashable container for modifications.
|
||||||
self._modifications = tuple(modifications)
|
self._modifications = tuple(modifications)
|
||||||
|
self._description = description
|
||||||
self._unpaired_msa = unpaired_msa
|
self._unpaired_msa = unpaired_msa
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@ -484,6 +509,10 @@ class RnaChain:
|
|||||||
def modifications(self) -> Sequence[tuple[str, int]]:
|
def modifications(self) -> Sequence[tuple[str, int]]:
|
||||||
return self._modifications
|
return self._modifications
|
||||||
|
|
||||||
|
@property
|
||||||
|
def description(self) -> str | None:
|
||||||
|
return self._description
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def unpaired_msa(self) -> str | None:
|
def unpaired_msa(self) -> str | None:
|
||||||
return self._unpaired_msa
|
return self._unpaired_msa
|
||||||
@ -496,17 +525,27 @@ class RnaChain:
|
|||||||
self._id == other._id
|
self._id == other._id
|
||||||
and self._sequence == other._sequence
|
and self._sequence == other._sequence
|
||||||
and self._modifications == other._modifications
|
and self._modifications == other._modifications
|
||||||
|
and self._description == other._description
|
||||||
and self._unpaired_msa == other._unpaired_msa
|
and self._unpaired_msa == other._unpaired_msa
|
||||||
)
|
)
|
||||||
|
|
||||||
def __hash__(self) -> int:
|
def __hash__(self) -> int:
|
||||||
return hash(
|
return hash((
|
||||||
(self._id, self._sequence, self._modifications, self._unpaired_msa)
|
self._id,
|
||||||
)
|
self._sequence,
|
||||||
|
self._modifications,
|
||||||
|
self._description,
|
||||||
|
self._unpaired_msa,
|
||||||
|
))
|
||||||
|
|
||||||
def hash_without_id(self) -> int:
|
def hash_without_id(self) -> int:
|
||||||
"""Returns a hash ignoring the ID - useful for deduplication."""
|
"""Returns a hash ignoring the ID - useful for deduplication."""
|
||||||
return hash((self._sequence, self._modifications, self._unpaired_msa))
|
return hash((
|
||||||
|
self._sequence,
|
||||||
|
self._modifications,
|
||||||
|
self._description,
|
||||||
|
self._unpaired_msa,
|
||||||
|
))
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_alphafoldserver_dict(
|
def from_alphafoldserver_dict(
|
||||||
@ -532,7 +571,14 @@ class RnaChain:
|
|||||||
json_dict = json_dict['rna']
|
json_dict = json_dict['rna']
|
||||||
_validate_keys(
|
_validate_keys(
|
||||||
json_dict.keys(),
|
json_dict.keys(),
|
||||||
{'id', 'sequence', 'unpairedMsa', 'unpairedMsaPath', 'modifications'},
|
{
|
||||||
|
'id',
|
||||||
|
'sequence',
|
||||||
|
'modifications',
|
||||||
|
'description',
|
||||||
|
'unpairedMsa',
|
||||||
|
'unpairedMsaPath',
|
||||||
|
},
|
||||||
)
|
)
|
||||||
sequence = json_dict['sequence']
|
sequence = json_dict['sequence']
|
||||||
modifications = [
|
modifications = [
|
||||||
@ -559,6 +605,7 @@ class RnaChain:
|
|||||||
id=seq_id or json_dict['id'],
|
id=seq_id or json_dict['id'],
|
||||||
sequence=sequence,
|
sequence=sequence,
|
||||||
modifications=modifications,
|
modifications=modifications,
|
||||||
|
description=json_dict.get('description', None),
|
||||||
unpaired_msa=unpaired_msa,
|
unpaired_msa=unpaired_msa,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -575,6 +622,8 @@ class RnaChain:
|
|||||||
],
|
],
|
||||||
'unpairedMsa': self._unpaired_msa,
|
'unpairedMsa': self._unpaired_msa,
|
||||||
}
|
}
|
||||||
|
if self._description is not None:
|
||||||
|
contents['description'] = self._description
|
||||||
return {'rna': contents}
|
return {'rna': contents}
|
||||||
|
|
||||||
def to_ccd_sequence(self) -> Sequence[str]:
|
def to_ccd_sequence(self) -> Sequence[str]:
|
||||||
@ -600,7 +649,7 @@ class RnaChain:
|
|||||||
class DnaChain:
|
class DnaChain:
|
||||||
"""Single strand DNA chain input."""
|
"""Single strand DNA chain input."""
|
||||||
|
|
||||||
__slots__ = ('_id', '_sequence', '_modifications')
|
__slots__ = ('_id', '_sequence', '_modifications', '_description')
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -608,6 +657,7 @@ class DnaChain:
|
|||||||
id: str, # pylint: disable=redefined-builtin
|
id: str, # pylint: disable=redefined-builtin
|
||||||
sequence: str,
|
sequence: str,
|
||||||
modifications: Sequence[tuple[str, int]],
|
modifications: Sequence[tuple[str, int]],
|
||||||
|
description: str | None = None,
|
||||||
):
|
):
|
||||||
"""Initializes a single strand DNA chain input.
|
"""Initializes a single strand DNA chain input.
|
||||||
|
|
||||||
@ -616,6 +666,7 @@ class DnaChain:
|
|||||||
sequence: The DNA sequence of the chain.
|
sequence: The DNA sequence of the chain.
|
||||||
modifications: A list of tuples containing the modification type and the
|
modifications: A list of tuples containing the modification type and the
|
||||||
(1-based) residue index where the modification is applied.
|
(1-based) residue index where the modification is applied.
|
||||||
|
description: An optional textual description of the DNA chain.
|
||||||
"""
|
"""
|
||||||
if not all(res.isalpha() for res in sequence):
|
if not all(res.isalpha() for res in sequence):
|
||||||
raise ValueError(f'DNA must contain only letters, got "{sequence}"')
|
raise ValueError(f'DNA must contain only letters, got "{sequence}"')
|
||||||
@ -630,6 +681,7 @@ class DnaChain:
|
|||||||
self._sequence = sequence
|
self._sequence = sequence
|
||||||
# Use hashable container for modifications.
|
# Use hashable container for modifications.
|
||||||
self._modifications = tuple(modifications)
|
self._modifications = tuple(modifications)
|
||||||
|
self._description = description
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def id(self) -> str:
|
def id(self) -> str:
|
||||||
@ -646,6 +698,10 @@ class DnaChain:
|
|||||||
for r in self.to_ccd_sequence()
|
for r in self.to_ccd_sequence()
|
||||||
])
|
])
|
||||||
|
|
||||||
|
@property
|
||||||
|
def description(self) -> str | None:
|
||||||
|
return self._description
|
||||||
|
|
||||||
def __len__(self) -> int:
|
def __len__(self) -> int:
|
||||||
return len(self._sequence)
|
return len(self._sequence)
|
||||||
|
|
||||||
@ -654,17 +710,20 @@ class DnaChain:
|
|||||||
self._id == other._id
|
self._id == other._id
|
||||||
and self._sequence == other._sequence
|
and self._sequence == other._sequence
|
||||||
and self._modifications == other._modifications
|
and self._modifications == other._modifications
|
||||||
|
and self._description == other._description
|
||||||
)
|
)
|
||||||
|
|
||||||
def __hash__(self) -> int:
|
def __hash__(self) -> int:
|
||||||
return hash((self._id, self._sequence, self._modifications))
|
return hash(
|
||||||
|
(self._id, self._sequence, self._modifications, self._description)
|
||||||
|
)
|
||||||
|
|
||||||
def modifications(self) -> Sequence[tuple[str, int]]:
|
def modifications(self) -> Sequence[tuple[str, int]]:
|
||||||
return self._modifications
|
return self._modifications
|
||||||
|
|
||||||
def hash_without_id(self) -> int:
|
def hash_without_id(self) -> int:
|
||||||
"""Returns a hash ignoring the ID - useful for deduplication."""
|
"""Returns a hash ignoring the ID - useful for deduplication."""
|
||||||
return hash((self._sequence, self._modifications))
|
return hash((self._sequence, self._modifications, self._description))
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_alphafoldserver_dict(
|
def from_alphafoldserver_dict(
|
||||||
@ -685,7 +744,9 @@ class DnaChain:
|
|||||||
) -> Self:
|
) -> Self:
|
||||||
"""Constructs DnaChain from the AlphaFold JSON dict."""
|
"""Constructs DnaChain from the AlphaFold JSON dict."""
|
||||||
json_dict = json_dict['dna']
|
json_dict = json_dict['dna']
|
||||||
_validate_keys(json_dict.keys(), {'id', 'sequence', 'modifications'})
|
_validate_keys(
|
||||||
|
json_dict.keys(), {'id', 'sequence', 'modifications', 'description'}
|
||||||
|
)
|
||||||
sequence = json_dict['sequence']
|
sequence = json_dict['sequence']
|
||||||
modifications = [
|
modifications = [
|
||||||
(mod['modificationType'], mod['basePosition'])
|
(mod['modificationType'], mod['basePosition'])
|
||||||
@ -695,6 +756,7 @@ class DnaChain:
|
|||||||
id=seq_id or json_dict['id'],
|
id=seq_id or json_dict['id'],
|
||||||
sequence=sequence,
|
sequence=sequence,
|
||||||
modifications=modifications,
|
modifications=modifications,
|
||||||
|
description=json_dict.get('description', None),
|
||||||
)
|
)
|
||||||
|
|
||||||
def to_dict(
|
def to_dict(
|
||||||
@ -709,6 +771,8 @@ class DnaChain:
|
|||||||
for mod in self._modifications
|
for mod in self._modifications
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
if self._description is not None:
|
||||||
|
contents['description'] = self._description
|
||||||
return {'dna': contents}
|
return {'dna': contents}
|
||||||
|
|
||||||
def to_ccd_sequence(self) -> Sequence[str]:
|
def to_ccd_sequence(self) -> Sequence[str]:
|
||||||
@ -734,11 +798,13 @@ class Ligand:
|
|||||||
a bond linking these components should be added to the bonded_atom_pairs
|
a bond linking these components should be added to the bonded_atom_pairs
|
||||||
Input field.
|
Input field.
|
||||||
smiles: The SMILES representation of the ligand.
|
smiles: The SMILES representation of the ligand.
|
||||||
|
description: An optional textual description of the ligand.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
id: str
|
id: str
|
||||||
ccd_ids: Sequence[str] | None = None
|
ccd_ids: Sequence[str] | None = None
|
||||||
smiles: str | None = None
|
smiles: str | None = None
|
||||||
|
description: str | None = None
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
if (self.ccd_ids is None) == (self.smiles is None):
|
if (self.ccd_ids is None) == (self.smiles is None):
|
||||||
@ -761,7 +827,7 @@ class Ligand:
|
|||||||
|
|
||||||
def hash_without_id(self) -> int:
|
def hash_without_id(self) -> int:
|
||||||
"""Returns a hash ignoring the ID - useful for deduplication."""
|
"""Returns a hash ignoring the ID - useful for deduplication."""
|
||||||
return hash((self.ccd_ids, self.smiles))
|
return hash((self.ccd_ids, self.smiles, self.description))
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_alphafoldserver_dict(
|
def from_alphafoldserver_dict(
|
||||||
@ -783,7 +849,9 @@ class Ligand:
|
|||||||
) -> Self:
|
) -> Self:
|
||||||
"""Constructs Ligand from the AlphaFold JSON dict."""
|
"""Constructs Ligand from the AlphaFold JSON dict."""
|
||||||
json_dict = json_dict['ligand']
|
json_dict = json_dict['ligand']
|
||||||
_validate_keys(json_dict.keys(), {'id', 'ccdCodes', 'smiles'})
|
_validate_keys(
|
||||||
|
json_dict.keys(), {'id', 'ccdCodes', 'smiles', 'description'}
|
||||||
|
)
|
||||||
if json_dict.get('ccdCodes') and json_dict.get('smiles'):
|
if json_dict.get('ccdCodes') and json_dict.get('smiles'):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
'Ligand cannot have both CCD code and SMILES set at the same time, '
|
'Ligand cannot have both CCD code and SMILES set at the same time, '
|
||||||
@ -797,9 +865,17 @@ class Ligand:
|
|||||||
'CCD codes must be a list of strings, got '
|
'CCD codes must be a list of strings, got '
|
||||||
f'{type(ccd_codes).__name__} instead: {ccd_codes}'
|
f'{type(ccd_codes).__name__} instead: {ccd_codes}'
|
||||||
)
|
)
|
||||||
return cls(id=seq_id or json_dict['id'], ccd_ids=ccd_codes)
|
return cls(
|
||||||
|
id=seq_id or json_dict['id'],
|
||||||
|
ccd_ids=ccd_codes,
|
||||||
|
description=json_dict.get('description', None),
|
||||||
|
)
|
||||||
elif 'smiles' in json_dict:
|
elif 'smiles' in json_dict:
|
||||||
return cls(id=seq_id or json_dict['id'], smiles=json_dict['smiles'])
|
return cls(
|
||||||
|
id=seq_id or json_dict['id'],
|
||||||
|
smiles=json_dict['smiles'],
|
||||||
|
description=json_dict.get('description', None),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f'Unknown ligand type: {json_dict}')
|
raise ValueError(f'Unknown ligand type: {json_dict}')
|
||||||
|
|
||||||
@ -812,6 +888,8 @@ class Ligand:
|
|||||||
contents['ccdCodes'] = self.ccd_ids
|
contents['ccdCodes'] = self.ccd_ids
|
||||||
if self.smiles is not None:
|
if self.smiles is not None:
|
||||||
contents['smiles'] = self.smiles
|
contents['smiles'] = self.smiles
|
||||||
|
if self.description is not None:
|
||||||
|
contents['description'] = self.description
|
||||||
return {'ligand': contents}
|
return {'ligand': contents}
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user