mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
- Added private alternative constructor SeqRecord._from_validated(...). This avoids type and length checks in cases when these have already been done - Modified SeqRecord seq and letter_annotation attributes to use decorator-based formatting, and included more type annotations - Modified sam parser to explicitly construct a new SeqRecord to remove the deepcopy bottleneck - Modified FastaIterator, QualityIO, and SffIO parsers to use _from_validated to redundant checks and make code cleaner - Added explicit checks in SeqRecord methods for cases when Seq is None, and specific error messages for those cases
1549 lines
60 KiB
Python
1549 lines
60 KiB
Python
# Copyright 2000-2002 Andrew Dalke. All rights reserved.
|
|
# Copyright 2002-2004 Brad Chapman. All rights reserved.
|
|
# Copyright 2006-2023 by Peter Cock. All rights reserved.
|
|
# Copyright 2020 by Michael R. Crusoe
|
|
#
|
|
# This file is part of the Biopython distribution and governed by your
|
|
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
|
# Please see the LICENSE file that should have been included as part of this
|
|
# package.
|
|
"""Represent a Sequence Record, a sequence with annotation."""
|
|
|
|
# NEEDS TO BE SYNCH WITH THE REST OF BIOPYTHON AND BIOPERL
|
|
# In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes
|
|
# need to be in sync (this is the BioSQL "Database SeqRecord").
|
|
import numbers
|
|
import warnings
|
|
from io import StringIO
|
|
from typing import Any
|
|
from typing import cast
|
|
from collections.abc import Iterator
|
|
from typing import NoReturn
|
|
from typing import Optional
|
|
from typing import overload
|
|
from collections.abc import Sequence
|
|
from typing import TYPE_CHECKING
|
|
from typing import Union
|
|
|
|
from Bio import BiopythonDeprecationWarning
|
|
from Bio import StreamModeError
|
|
from Bio.Seq import MutableSeq
|
|
from Bio.Seq import Seq
|
|
from Bio.Seq import UndefinedSequenceError
|
|
|
|
if TYPE_CHECKING:
|
|
from Bio.SeqFeature import SeqFeature
|
|
|
|
_NO_SEQRECORD_COMPARISON = "SeqRecord comparison is deliberately not implemented. Explicitly compare the attributes of interest."
|
|
|
|
|
|
class _RestrictedDict(dict[str, Sequence[Any]]):
|
|
"""Dict which only allows sequences of given length as values (PRIVATE).
|
|
|
|
This simple subclass of the Python dictionary is used in the SeqRecord
|
|
object for holding per-letter-annotations. This class is intended to
|
|
prevent simple errors by only allowing python sequences (e.g. lists,
|
|
strings and tuples) to be stored, and only if their length matches that
|
|
expected (the length of the SeqRecord's seq object). It cannot however
|
|
prevent the entries being edited in situ (for example appending entries
|
|
to a list).
|
|
|
|
>>> x = _RestrictedDict(5)
|
|
>>> x["test"] = "hello"
|
|
>>> x
|
|
{'test': 'hello'}
|
|
|
|
Adding entries which don't have the expected length are blocked:
|
|
|
|
>>> x["test"] = "hello world"
|
|
Traceback (most recent call last):
|
|
...
|
|
TypeError: Any per-letter annotation should be a Python sequence (list, tuple or string) of the same length as the biological sequence, here 5.
|
|
|
|
The expected length is stored as a private attribute,
|
|
|
|
>>> x._length
|
|
5
|
|
|
|
In order that the SeqRecord (and other objects using this class) can be
|
|
pickled, for example for use in the multiprocessing library, we need to
|
|
be able to pickle the restricted dictionary objects.
|
|
|
|
Using the default protocol, which is 3 on Python 3,
|
|
|
|
>>> import pickle
|
|
>>> y = pickle.loads(pickle.dumps(x))
|
|
>>> y
|
|
{'test': 'hello'}
|
|
>>> y._length
|
|
5
|
|
|
|
Using the highest protocol, which is 4 on Python 3,
|
|
|
|
>>> import pickle
|
|
>>> z = pickle.loads(pickle.dumps(x, pickle.HIGHEST_PROTOCOL))
|
|
>>> z
|
|
{'test': 'hello'}
|
|
>>> z._length
|
|
5
|
|
"""
|
|
|
|
def __init__(self, length: int) -> None:
|
|
"""Create an EMPTY restricted dictionary."""
|
|
dict.__init__(self)
|
|
self._length = int(length)
|
|
|
|
def __setitem__(self, key: str, value: Sequence[Any]) -> None:
|
|
# The check hasattr(self, "_length") is to cope with pickle protocol 2
|
|
# I couldn't seem to avoid this with __getstate__ and __setstate__
|
|
if (
|
|
not hasattr(value, "__len__")
|
|
or not hasattr(value, "__getitem__")
|
|
or (hasattr(self, "_length") and len(value) != self._length)
|
|
):
|
|
raise TypeError(
|
|
"Any per-letter annotation should be a Python sequence "
|
|
"(list, tuple or string) of the same length as the "
|
|
f"biological sequence, here {self._length}."
|
|
)
|
|
dict.__setitem__(self, key, value)
|
|
|
|
def update(self, new_dict):
|
|
# Force this to go via our strict __setitem__ method
|
|
for key, value in new_dict.items():
|
|
self[key] = value
|
|
|
|
|
|
class SeqRecord:
|
|
"""A SeqRecord object holds a sequence and information about it.
|
|
|
|
Main attributes:
|
|
- id - Identifier such as a locus tag (string)
|
|
- seq - The sequence itself (Seq object or similar)
|
|
|
|
Additional attributes:
|
|
- name - Sequence name, e.g. gene name (string)
|
|
- description - Additional text (string)
|
|
- dbxrefs - List of database cross references (list of strings)
|
|
- features - Any (sub)features defined (list of SeqFeature objects)
|
|
- annotations - Further information about the whole sequence (dictionary).
|
|
Most entries are strings, or lists of strings.
|
|
- letter_annotations - Per letter/symbol annotation (restricted
|
|
dictionary). This holds Python sequences (lists, strings
|
|
or tuples) whose length matches that of the sequence.
|
|
A typical use would be to hold a list of integers
|
|
representing sequencing quality scores, or a string
|
|
representing the secondary structure.
|
|
|
|
You will typically use Bio.SeqIO to read in sequences from files as
|
|
SeqRecord objects. However, you may want to create your own SeqRecord
|
|
objects directly (see the __init__ method for further details):
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> from Bio.SeqRecord import SeqRecord
|
|
>>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
|
|
... id="YP_025292.1", name="HokC",
|
|
... description="toxic membrane protein")
|
|
>>> print(record)
|
|
ID: YP_025292.1
|
|
Name: HokC
|
|
Description: toxic membrane protein
|
|
Number of features: 0
|
|
Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
|
|
|
|
If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO
|
|
for this. For the special case where you want the SeqRecord turned into
|
|
a string in a particular file format there is a format method which uses
|
|
Bio.SeqIO internally:
|
|
|
|
>>> print(record.format("fasta"))
|
|
>YP_025292.1 toxic membrane protein
|
|
MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
|
|
<BLANKLINE>
|
|
|
|
You can also do things like slicing a SeqRecord, checking its length, etc
|
|
|
|
>>> len(record)
|
|
44
|
|
>>> edited = record[:10] + record[11:]
|
|
>>> print(edited.seq)
|
|
MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
|
|
>>> print(record.seq)
|
|
MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
|
|
|
|
"""
|
|
|
|
_AnnotationsDictValue = Union[str, int]
|
|
_AnnotationsDict = dict[str, _AnnotationsDictValue]
|
|
|
|
annotations: _AnnotationsDict
|
|
dbxrefs: list[str]
|
|
_per_letter_annotations: Optional[_RestrictedDict]
|
|
|
|
def __init__(
|
|
self,
|
|
seq: Optional[Union["Seq", "MutableSeq"]],
|
|
id: Optional[str] = "<unknown id>",
|
|
name: str = "<unknown name>",
|
|
description: str = "<unknown description>",
|
|
dbxrefs: Optional[list[str]] = None,
|
|
features: Optional[list["SeqFeature"]] = None,
|
|
annotations: Optional[_AnnotationsDict] = None,
|
|
letter_annotations: Optional[dict[str, Sequence[Any]]] = None,
|
|
) -> None:
|
|
"""Create a SeqRecord.
|
|
|
|
Arguments:
|
|
- seq - Sequence, required (Seq or MutableSeq)
|
|
- id - Sequence identifier, recommended (string)
|
|
- name - Sequence name, optional (string)
|
|
- description - Sequence description, optional (string)
|
|
- dbxrefs - Database cross references, optional (list of strings)
|
|
- features - Any (sub)features, optional (list of SeqFeature objects)
|
|
- annotations - Dictionary of annotations for the whole sequence
|
|
- letter_annotations - Dictionary of per-letter-annotations, values
|
|
should be strings, list or tuples of the same length as the full
|
|
sequence.
|
|
|
|
You will typically use Bio.SeqIO to read in sequences from files as
|
|
SeqRecord objects. However, you may want to create your own SeqRecord
|
|
objects directly.
|
|
|
|
Note that while an id is optional, we strongly recommend you supply a
|
|
unique id string for each record. This is especially important
|
|
if you wish to write your sequences to a file.
|
|
|
|
You can create a 'blank' SeqRecord object, and then populate the
|
|
attributes later.
|
|
"""
|
|
if id is not None and not isinstance(id, str):
|
|
# Lots of existing code uses id=None... this may be a bad idea.
|
|
raise TypeError("id argument should be a string")
|
|
if not isinstance(name, str):
|
|
raise TypeError("name argument should be a string")
|
|
if not isinstance(description, str):
|
|
raise TypeError("description argument should be a string")
|
|
|
|
if seq is not None and not isinstance(seq, (Seq, MutableSeq)):
|
|
warnings.warn(
|
|
"Using a string as the sequence is deprecated and will raise a"
|
|
" TypeError in future. It has been converted to a Seq object.",
|
|
BiopythonDeprecationWarning,
|
|
)
|
|
seq = Seq(seq)
|
|
|
|
self._seq = seq
|
|
self.id = id
|
|
self.name = name
|
|
self.description = description
|
|
|
|
# database cross references (for the whole sequence)
|
|
if dbxrefs is None:
|
|
dbxrefs = []
|
|
elif not isinstance(dbxrefs, list):
|
|
raise TypeError("dbxrefs argument should be a list (of strings)")
|
|
self.dbxrefs = dbxrefs
|
|
|
|
# annotations about the whole sequence
|
|
if annotations is None:
|
|
annotations = {}
|
|
elif not isinstance(annotations, dict):
|
|
raise TypeError("annotations argument must be a dict or None")
|
|
self.annotations = annotations
|
|
|
|
self._per_letter_annotations = None
|
|
if letter_annotations is not None:
|
|
# This will be handled via the property set function, which will
|
|
# turn this into a _RestrictedDict and thus ensure all the values
|
|
# in the dict are the right length
|
|
self.letter_annotations = letter_annotations
|
|
|
|
# annotations about parts of the sequence
|
|
if features is None:
|
|
features = []
|
|
elif not isinstance(features, list):
|
|
raise TypeError(
|
|
"features argument should be a list (of SeqFeature objects)"
|
|
)
|
|
self.features = features
|
|
|
|
@property
|
|
def letter_annotations(self) -> dict[str, Sequence[Any]]:
|
|
"""Dictionary of per-letter-annotation for the sequence.
|
|
|
|
For example, this can hold quality scores used in FASTQ or QUAL files.
|
|
Consider this example using Bio.SeqIO to read in an example Solexa
|
|
variant FASTQ file as a SeqRecord:
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
|
|
>>> print("%s %s" % (record.id, record.seq))
|
|
slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
|
|
>>> print(list(record.letter_annotations))
|
|
['solexa_quality']
|
|
>>> print(record.letter_annotations["solexa_quality"])
|
|
[40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
|
|
|
|
The letter_annotations get sliced automatically if you slice the
|
|
parent SeqRecord, for example taking the last ten bases:
|
|
|
|
>>> sub_record = record[-10:]
|
|
>>> print("%s %s" % (sub_record.id, sub_record.seq))
|
|
slxa_0001_1_0001_01 ACGTNNNNNN
|
|
>>> print(sub_record.letter_annotations["solexa_quality"])
|
|
[4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
|
|
|
|
Any python sequence (i.e. list, tuple or string) can be recorded in
|
|
the SeqRecord's letter_annotations dictionary as long as the length
|
|
matches that of the SeqRecord's sequence. e.g.
|
|
|
|
>>> len(sub_record.letter_annotations)
|
|
1
|
|
>>> sub_record.letter_annotations["dummy"] = "abcdefghij"
|
|
>>> len(sub_record.letter_annotations)
|
|
2
|
|
|
|
You can delete entries from the letter_annotations dictionary as usual:
|
|
|
|
>>> del sub_record.letter_annotations["solexa_quality"]
|
|
>>> sub_record.letter_annotations
|
|
{'dummy': 'abcdefghij'}
|
|
|
|
You can completely clear the dictionary easily as follows:
|
|
|
|
>>> sub_record.letter_annotations = {}
|
|
>>> sub_record.letter_annotations
|
|
{}
|
|
|
|
Note that if replacing the record's sequence with a sequence of a
|
|
different length you must first clear the letter_annotations dict.
|
|
"""
|
|
if self._per_letter_annotations is None:
|
|
length = 0 if self.seq is None else len(self.seq)
|
|
self._per_letter_annotations = _RestrictedDict(length=length)
|
|
return self._per_letter_annotations
|
|
|
|
# TODO - Just make this a read only property?
|
|
@letter_annotations.setter
|
|
def letter_annotations(self, value: dict[str, Sequence[Any]]) -> None:
|
|
if not isinstance(value, dict):
|
|
raise TypeError(
|
|
"The per-letter-annotations should be a (restricted) dictionary."
|
|
)
|
|
# Turn this into a restricted-dictionary (and check the entries)
|
|
length = 0 if self.seq is None else len(self.seq)
|
|
if any(len(val) != length for val in value.values()):
|
|
raise ValueError(
|
|
f"The per-letter-annotations have the same length as the sequence, but found: \n {','.join([f'{key}={val}' for key, val in value.items() if len(val) != length])}"
|
|
)
|
|
if self._per_letter_annotations is None:
|
|
self._per_letter_annotations = _RestrictedDict(length=length)
|
|
else:
|
|
self._per_letter_annotations.clear()
|
|
dict.update(self._per_letter_annotations, value) # type: ignore
|
|
|
|
@property
|
|
def seq(self) -> Optional[Union["Seq", "MutableSeq"]]:
|
|
"""The sequence itself, as a Seq or MutableSeq object."""
|
|
return self._seq
|
|
|
|
@seq.setter
|
|
def seq(self, value: Union["Seq", "MutableSeq"]) -> None:
|
|
# Adding this here for users who are not type-checking their code.
|
|
if not isinstance(value, (Seq, MutableSeq)):
|
|
warnings.warn(
|
|
"Using a string as the sequence is deprecated and will raise a"
|
|
" TypeError in future. It has been converted to a Seq object.",
|
|
BiopythonDeprecationWarning,
|
|
)
|
|
value = Seq(value)
|
|
|
|
# TODO - Add a deprecation warning that the seq should be write only?
|
|
if self._per_letter_annotations:
|
|
if len(self) != len(value):
|
|
# TODO - Make this a warning? Silently empty the dictionary?
|
|
raise ValueError("You must empty the letter annotations first!")
|
|
else:
|
|
# Leave the existing per letter annotations unchanged:
|
|
self._seq = value
|
|
else:
|
|
self._seq = value
|
|
# Reset the (empty) letter annotations dict with new length:
|
|
length = 0 if self.seq is None else len(self.seq)
|
|
self._per_letter_annotations = _RestrictedDict(length=length)
|
|
|
|
@classmethod
|
|
def _from_validated(
|
|
cls,
|
|
seq: Optional[Union[Seq, MutableSeq]],
|
|
id: Optional[str] = "<unknown id>",
|
|
name: str = "<unknown name>",
|
|
description: str = "<unknown description>",
|
|
dbxrefs: Optional[list[str]] = None,
|
|
features: Optional[list["SeqFeature"]] = None,
|
|
annotations: Optional[dict[str, Union[str, int]]] = None,
|
|
letter_annotations: Optional[dict[str, Sequence]] = None,
|
|
) -> "SeqRecord":
|
|
"""Faster constructor for post-validated data like copies or validated parsed data"""
|
|
|
|
if cls is not SeqRecord:
|
|
# If we subclassed, we'll default to that class's initializer just to be careful
|
|
return cls(
|
|
seq,
|
|
id,
|
|
name,
|
|
description,
|
|
dbxrefs,
|
|
features,
|
|
annotations,
|
|
letter_annotations,
|
|
)
|
|
|
|
inst = cls.__new__(cls)
|
|
|
|
inst._seq = seq
|
|
inst.id = id
|
|
inst.name = name
|
|
inst.description = description
|
|
if dbxrefs is None:
|
|
dbxrefs = []
|
|
inst.dbxrefs = dbxrefs
|
|
if features is None:
|
|
features = []
|
|
inst.features = features
|
|
if annotations is None:
|
|
annotations = {}
|
|
inst.annotations = annotations
|
|
|
|
inst._per_letter_annotations = None
|
|
if letter_annotations is not None:
|
|
length = 0 if seq is None else len(seq)
|
|
inst._per_letter_annotations = _RestrictedDict(length=length)
|
|
dict.update(inst._per_letter_annotations, letter_annotations) # type: ignore
|
|
return inst
|
|
|
|
@overload
|
|
def __getitem__(self, index: int) -> str: ...
|
|
|
|
@overload
|
|
def __getitem__(self, index: slice) -> "SeqRecord": ...
|
|
|
|
def __getitem__(self, index):
|
|
"""Return a sub-sequence or an individual letter.
|
|
|
|
Slicing, e.g. my_record[5:10], returns a new SeqRecord for
|
|
that sub-sequence with some annotation preserved as follows:
|
|
|
|
* The name, id and description are kept as-is.
|
|
* Any per-letter-annotations are sliced to match the requested
|
|
sub-sequence.
|
|
* Unless a stride is used, all those features which fall fully
|
|
within the subsequence are included (with their locations
|
|
adjusted accordingly). If you want to preserve any truncated
|
|
features (e.g. GenBank/EMBL source features), you must
|
|
explicitly add them to the new SeqRecord yourself.
|
|
* With the exception of any molecule type, the annotations
|
|
dictionary and the dbxrefs list are not used for the new
|
|
SeqRecord, as in general they may not apply to the
|
|
subsequence. If you want to preserve them, you must explicitly
|
|
copy them to the new SeqRecord yourself.
|
|
|
|
Using an integer index, e.g. my_record[5] is shorthand for
|
|
extracting that letter from the sequence, my_record.seq[5].
|
|
|
|
For example, consider this short protein and its secondary
|
|
structure as encoded by the PDB (e.g. H for alpha helices),
|
|
plus a simple feature for its histidine self phosphorylation
|
|
site:
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> from Bio.SeqRecord import SeqRecord
|
|
>>> from Bio.SeqFeature import SeqFeature, SimpleLocation
|
|
>>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT"
|
|
... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR"),
|
|
... id="1JOY", name="EnvZ",
|
|
... description="Homodimeric domain of EnvZ from E. coli")
|
|
>>> rec.letter_annotations["secondary_structure"] = " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT "
|
|
>>> rec.features.append(SeqFeature(SimpleLocation(20, 21),
|
|
... type = "Site"))
|
|
|
|
Now let's have a quick look at the full record,
|
|
|
|
>>> print(rec)
|
|
ID: 1JOY
|
|
Name: EnvZ
|
|
Description: Homodimeric domain of EnvZ from E. coli
|
|
Number of features: 1
|
|
Per letter annotation for: secondary_structure
|
|
Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR')
|
|
>>> rec.letter_annotations["secondary_structure"]
|
|
' S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT '
|
|
>>> print(rec.features[0].location)
|
|
[20:21]
|
|
|
|
Now let's take a sub sequence, here chosen as the first (fractured)
|
|
alpha helix which includes the histidine phosphorylation site:
|
|
|
|
>>> sub = rec[11:41]
|
|
>>> print(sub)
|
|
ID: 1JOY
|
|
Name: EnvZ
|
|
Description: Homodimeric domain of EnvZ from E. coli
|
|
Number of features: 1
|
|
Per letter annotation for: secondary_structure
|
|
Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD')
|
|
>>> sub.letter_annotations["secondary_structure"]
|
|
'HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH'
|
|
>>> print(sub.features[0].location)
|
|
[9:10]
|
|
|
|
You can also of course omit the start or end values, for
|
|
example to get the first ten letters only:
|
|
|
|
>>> print(rec[:10])
|
|
ID: 1JOY
|
|
Name: EnvZ
|
|
Description: Homodimeric domain of EnvZ from E. coli
|
|
Number of features: 0
|
|
Per letter annotation for: secondary_structure
|
|
Seq('MAAGVKQLAD')
|
|
|
|
Or for the last ten letters:
|
|
|
|
>>> print(rec[-10:])
|
|
ID: 1JOY
|
|
Name: EnvZ
|
|
Description: Homodimeric domain of EnvZ from E. coli
|
|
Number of features: 0
|
|
Per letter annotation for: secondary_structure
|
|
Seq('IIEQFIDYLR')
|
|
|
|
If you omit both, then you get a copy of the original record (although
|
|
lacking the annotations and dbxrefs):
|
|
|
|
>>> print(rec[:])
|
|
ID: 1JOY
|
|
Name: EnvZ
|
|
Description: Homodimeric domain of EnvZ from E. coli
|
|
Number of features: 1
|
|
Per letter annotation for: secondary_structure
|
|
Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR')
|
|
|
|
Finally, indexing with a simple integer is shorthand for pulling out
|
|
that letter from the sequence directly:
|
|
|
|
>>> rec[5]
|
|
'K'
|
|
>>> rec.seq[5]
|
|
'K'
|
|
"""
|
|
if isinstance(index, numbers.Integral):
|
|
# NOTE - The sequence level annotation like the id, name, etc
|
|
# do not really apply to a single character. However, should
|
|
# we try and expose any per-letter-annotation here? If so how?
|
|
if self.seq is None:
|
|
raise ValueError(
|
|
"Seq in SeqRecord is None, it doesn't support indexing"
|
|
)
|
|
return self.seq[index]
|
|
elif isinstance(index, slice):
|
|
if self.seq is None:
|
|
raise ValueError("Seq in SeqRecord is None, we cannot slice it")
|
|
parent_length = len(self)
|
|
try:
|
|
from BioSQL.BioSeq import DBSeqRecord
|
|
|
|
biosql_available = True
|
|
except ImportError:
|
|
biosql_available = False
|
|
|
|
if biosql_available and isinstance(self, DBSeqRecord):
|
|
answer = SeqRecord._from_validated(
|
|
self.seq[index],
|
|
id=self.id,
|
|
name=self.name,
|
|
description=self.description,
|
|
)
|
|
else:
|
|
answer = self._from_validated(
|
|
self.seq[index],
|
|
id=self.id,
|
|
name=self.name,
|
|
description=self.description,
|
|
)
|
|
# TODO - The description may no longer apply.
|
|
# It would be safer to change it to something
|
|
# generic like "edited" or the default value.
|
|
|
|
# Don't copy the annotation dict and dbxefs list,
|
|
# they may not apply to a subsequence.
|
|
# answer.annotations = dict(self.annotations.items())
|
|
# answer.dbxrefs = self.dbxrefs[:]
|
|
# TODO - Review this in light of adding SeqRecord objects?
|
|
|
|
if "molecule_type" in self.annotations:
|
|
# This will still apply, and we need it for GenBank/EMBL etc output
|
|
answer.annotations["molecule_type"] = self.annotations["molecule_type"]
|
|
|
|
# TODO - Cope with strides by generating ambiguous locations?
|
|
start, stop, step = index.indices(parent_length)
|
|
if step == 1:
|
|
# Select relevant features, add them with shifted locations
|
|
# assert str(self.seq)[index] == str(self.seq)[start:stop]
|
|
for f in self.features:
|
|
if f.location.ref or f.location.ref_db:
|
|
# TODO - Implement this (with lots of tests)?
|
|
import warnings
|
|
|
|
warnings.warn(
|
|
"When slicing SeqRecord objects, any "
|
|
"SeqFeature referencing other sequences (e.g. "
|
|
"from segmented GenBank records) are ignored."
|
|
)
|
|
continue
|
|
try:
|
|
if start <= f.location.start and f.location.end <= stop:
|
|
answer.features.append(f._shift(-start))
|
|
except TypeError:
|
|
# Will fail on UnknownPosition
|
|
pass
|
|
|
|
# Slice all the values to match the sliced sequence
|
|
# (this should also work with strides, even negative strides):
|
|
for key, value in self.letter_annotations.items():
|
|
answer.letter_annotations[key] = value[index]
|
|
|
|
return answer
|
|
raise ValueError("Invalid index")
|
|
|
|
def __iter__(self) -> Iterator[str]:
|
|
"""Iterate over the letters in the sequence.
|
|
|
|
For example, using Bio.SeqIO to read in a protein FASTA file:
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> record = SeqIO.read("Fasta/loveliesbleeding.pro", "fasta")
|
|
>>> for amino in record:
|
|
... print(amino)
|
|
... if amino == "L": break
|
|
X
|
|
A
|
|
G
|
|
L
|
|
>>> print(record.seq[3])
|
|
L
|
|
|
|
This is just a shortcut for iterating over the sequence directly:
|
|
|
|
>>> for amino in record.seq:
|
|
... print(amino)
|
|
... if amino == "L": break
|
|
X
|
|
A
|
|
G
|
|
L
|
|
>>> print(record.seq[3])
|
|
L
|
|
|
|
Note that this does not facilitate iteration together with any
|
|
per-letter-annotation. However, you can achieve that using the
|
|
python zip function on the record (or its sequence) and the relevant
|
|
per-letter-annotation:
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> rec = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
|
|
>>> print("%s %s" % (rec.id, rec.seq))
|
|
slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
|
|
>>> print(list(rec.letter_annotations))
|
|
['solexa_quality']
|
|
>>> for nuc, qual in zip(rec, rec.letter_annotations["solexa_quality"]):
|
|
... if qual > 35:
|
|
... print("%s %i" % (nuc, qual))
|
|
A 40
|
|
C 39
|
|
G 38
|
|
T 37
|
|
A 36
|
|
|
|
You may agree that using zip(rec.seq, ...) is more explicit than using
|
|
zip(rec, ...) as shown above.
|
|
"""
|
|
if self._seq is None:
|
|
raise ValueError("Seq in SeqRecord is None, can't iterate over it")
|
|
return iter(self._seq)
|
|
|
|
def __contains__(self, char: str) -> bool:
|
|
"""Implement the 'in' keyword, searches the sequence.
|
|
|
|
e.g.
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta")
|
|
>>> "GAATTC" in record
|
|
False
|
|
>>> "AAA" in record
|
|
True
|
|
|
|
This essentially acts as a proxy for using "in" on the sequence:
|
|
|
|
>>> "GAATTC" in record.seq
|
|
False
|
|
>>> "AAA" in record.seq
|
|
True
|
|
|
|
Note that you can also use Seq objects as the query,
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> Seq("AAA") in record
|
|
True
|
|
|
|
See also the Seq object's __contains__ method.
|
|
"""
|
|
if self._seq is None:
|
|
raise ValueError("Seq in SeqRecord is None, can't convert to bytes")
|
|
return char in self._seq
|
|
|
|
def __bytes__(self) -> bytes:
|
|
if self._seq is None:
|
|
raise ValueError("Seq in SeqRecord is None, can't convert to bytes")
|
|
return bytes(self._seq)
|
|
|
|
def __str__(self) -> str:
|
|
"""Return a human readable summary of the record and its annotation (string).
|
|
|
|
The python built in function str works by calling the object's __str__
|
|
method. e.g.
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> from Bio.SeqRecord import SeqRecord
|
|
>>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
|
|
... id="YP_025292.1", name="HokC",
|
|
... description="toxic membrane protein, small")
|
|
>>> print(str(record))
|
|
ID: YP_025292.1
|
|
Name: HokC
|
|
Description: toxic membrane protein, small
|
|
Number of features: 0
|
|
Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
|
|
|
|
In this example you don't actually need to call str explicitly, as the
|
|
print command does this automatically:
|
|
|
|
>>> print(record)
|
|
ID: YP_025292.1
|
|
Name: HokC
|
|
Description: toxic membrane protein, small
|
|
Number of features: 0
|
|
Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
|
|
|
|
Note that long sequences are shown truncated.
|
|
"""
|
|
lines: list[str] = []
|
|
if self.id:
|
|
lines.append(f"ID: {self.id}")
|
|
if self.name:
|
|
lines.append(f"Name: {self.name}")
|
|
if self.description:
|
|
lines.append(f"Description: {self.description}")
|
|
if self.dbxrefs:
|
|
lines.append("Database cross-references: " + ", ".join(self.dbxrefs))
|
|
lines.append(f"Number of features: {len(self.features)}")
|
|
for a in self.annotations:
|
|
lines.append(f"/{a}={self.annotations[a]!s}")
|
|
if self.letter_annotations:
|
|
lines.append(
|
|
"Per letter annotation for: " + ", ".join(self.letter_annotations)
|
|
)
|
|
if self.seq is not None:
|
|
try:
|
|
bytes(self.seq)
|
|
except UndefinedSequenceError:
|
|
lines.append(f"Undefined sequence of length {len(self.seq)}")
|
|
else:
|
|
# Don't want to include the entire sequence
|
|
seq = repr(self.seq)
|
|
lines.append(seq)
|
|
else:
|
|
lines.append("Missing Sequence (None)")
|
|
|
|
return "\n".join(lines)
|
|
|
|
def __repr__(self) -> str:
|
|
"""Return a concise summary of the record for debugging (string).
|
|
|
|
The python built in function repr works by calling the object's __repr__
|
|
method. e.g.
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> from Bio.SeqRecord import SeqRecord
|
|
>>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT"
|
|
... "GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ"
|
|
... "SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ"
|
|
... "PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF"),
|
|
... id="NP_418483.1", name="b4059",
|
|
... description="ssDNA-binding protein",
|
|
... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"])
|
|
>>> print(repr(rec))
|
|
SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF'), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
|
|
|
|
At the python prompt you can also use this shorthand:
|
|
|
|
>>> rec
|
|
SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF'), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
|
|
|
|
Note that long sequences are shown truncated. Also note that any
|
|
annotations, letter_annotations and features are not shown (as they
|
|
would lead to a very long string).
|
|
"""
|
|
return (
|
|
f"{self.__class__.__name__}(seq={self.seq!r}, id={self.id!r},"
|
|
f" name={self.name!r}, description={self.description!r},"
|
|
f" dbxrefs={self.dbxrefs!r})"
|
|
)
|
|
|
|
def format(self, format: str) -> str:
|
|
r"""Return the record as a string in the specified file format.
|
|
|
|
The format should be a lower case string supported as an output
|
|
format by Bio.SeqIO, which is used to turn the SeqRecord into a
|
|
string. e.g.
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> from Bio.SeqRecord import SeqRecord
|
|
>>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
|
|
... id="YP_025292.1", name="HokC",
|
|
... description="toxic membrane protein")
|
|
>>> record.format("fasta")
|
|
'>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n'
|
|
>>> print(record.format("fasta"))
|
|
>YP_025292.1 toxic membrane protein
|
|
MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
|
|
<BLANKLINE>
|
|
|
|
The Python print function automatically appends a new line, meaning
|
|
in this example a blank line is shown. If you look at the string
|
|
representation you can see there is a trailing new line (shown as
|
|
slash n) which is important when writing to a file or if
|
|
concatenating multiple sequence strings together.
|
|
|
|
Note that this method will NOT work on every possible file format
|
|
supported by Bio.SeqIO (e.g. some are for multiple sequences only,
|
|
and binary formats are not supported).
|
|
"""
|
|
# See also the __format__ method
|
|
return self.__format__(format)
|
|
|
|
def __format__(self, format_spec: str) -> str:
|
|
r"""Return the record as a string in the specified file format.
|
|
|
|
This method supports the Python format() function and f-strings.
|
|
The format_spec should be a lower case string supported by
|
|
Bio.SeqIO as a text output file format. Requesting a binary file
|
|
format raises a ValueError. e.g.
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> from Bio.SeqRecord import SeqRecord
|
|
>>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
|
|
... id="YP_025292.1", name="HokC",
|
|
... description="toxic membrane protein")
|
|
...
|
|
>>> format(record, "fasta")
|
|
'>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n'
|
|
>>> print(f"Here is {record.id} in FASTA format:\n{record:fasta}")
|
|
Here is YP_025292.1 in FASTA format:
|
|
>YP_025292.1 toxic membrane protein
|
|
MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
|
|
<BLANKLINE>
|
|
|
|
See also the SeqRecord's format() method.
|
|
"""
|
|
if not format_spec:
|
|
# Follow python convention and default to using __str__
|
|
return str(self)
|
|
from Bio import SeqIO
|
|
|
|
# Easy case, can call string-building function directly
|
|
if format_spec in SeqIO._FormatToString:
|
|
return SeqIO._FormatToString[format_spec](self)
|
|
|
|
# Harder case, make a temp handle instead
|
|
handle = StringIO()
|
|
try:
|
|
SeqIO.write(self, handle, format_spec)
|
|
except StreamModeError:
|
|
raise ValueError(
|
|
"Binary format %s cannot be used with SeqRecord format method"
|
|
% format_spec
|
|
) from None
|
|
return handle.getvalue()
|
|
|
|
def __len__(self) -> int:
|
|
"""Return the length of the sequence.
|
|
|
|
For example, using Bio.SeqIO to read in a FASTA nucleotide file:
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta")
|
|
>>> len(record)
|
|
309
|
|
>>> len(record.seq)
|
|
309
|
|
"""
|
|
return len(self._seq) if self._seq is not None else 0
|
|
|
|
def __lt__(self, other: Any) -> NoReturn:
|
|
"""Define the less-than operand (not implemented)."""
|
|
raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
|
|
|
|
def __le__(self, other: Any) -> NoReturn:
|
|
"""Define the less-than-or-equal-to operand (not implemented)."""
|
|
raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
|
|
|
|
def __eq__(self, other: object) -> NoReturn:
|
|
"""Define the equal-to operand (not implemented)."""
|
|
raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
|
|
|
|
def __ne__(self, other: object) -> NoReturn:
|
|
"""Define the not-equal-to operand (not implemented)."""
|
|
raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
|
|
|
|
def __gt__(self, other: Any) -> NoReturn:
|
|
"""Define the greater-than operand (not implemented)."""
|
|
raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
|
|
|
|
def __ge__(self, other: Any) -> NoReturn:
|
|
"""Define the greater-than-or-equal-to operand (not implemented)."""
|
|
raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
|
|
|
|
def __bool__(self) -> bool:
|
|
"""Boolean value of an instance of this class (True).
|
|
|
|
This behaviour is for backwards compatibility, since until the
|
|
__len__ method was added, a SeqRecord always evaluated as True.
|
|
|
|
Note that in comparison, a Seq object will evaluate to False if it
|
|
has a zero length sequence.
|
|
|
|
WARNING: The SeqRecord may in future evaluate to False when its
|
|
sequence is of zero length (in order to better match the Seq
|
|
object behaviour)!
|
|
"""
|
|
return True
|
|
|
|
def __add__(
|
|
self, other: Union["SeqRecord", "Seq", "MutableSeq", str]
|
|
) -> "SeqRecord":
|
|
"""Add another sequence or string to this sequence.
|
|
|
|
The other sequence can be a SeqRecord object, a Seq object (or
|
|
similar, e.g. a MutableSeq) or a plain Python string. If you add
|
|
a plain string or a Seq (like) object, the new SeqRecord will simply
|
|
have this appended to the existing data. However, any per letter
|
|
annotation will be lost:
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
|
|
>>> print("%s %s" % (record.id, record.seq))
|
|
slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
|
|
>>> print(list(record.letter_annotations))
|
|
['solexa_quality']
|
|
|
|
>>> new = record + "ACT"
|
|
>>> print("%s %s" % (new.id, new.seq))
|
|
slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT
|
|
>>> print(list(new.letter_annotations))
|
|
[]
|
|
|
|
The new record will attempt to combine the annotation, but for any
|
|
ambiguities (e.g. different names) it defaults to omitting that
|
|
annotation.
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> with open("GenBank/pBAD30.gb") as handle:
|
|
... plasmid = SeqIO.read(handle, "gb")
|
|
>>> print("%s %i" % (plasmid.id, len(plasmid)))
|
|
pBAD30 4923
|
|
|
|
Now let's cut the plasmid into two pieces, and join them back up the
|
|
other way round (i.e. shift the starting point on this plasmid, have
|
|
a look at the annotated features in the original file to see why this
|
|
particular split point might make sense):
|
|
|
|
>>> left = plasmid[:3765]
|
|
>>> right = plasmid[3765:]
|
|
>>> new = right + left
|
|
>>> print("%s %i" % (new.id, len(new)))
|
|
pBAD30 4923
|
|
>>> str(new.seq) == str(right.seq + left.seq)
|
|
True
|
|
>>> len(new.features) == len(left.features) + len(right.features)
|
|
True
|
|
|
|
When we add the left and right SeqRecord objects, their annotation
|
|
is all consistent, so it is all conserved in the new SeqRecord:
|
|
|
|
>>> new.id == left.id == right.id == plasmid.id
|
|
True
|
|
>>> new.name == left.name == right.name == plasmid.name
|
|
True
|
|
>>> new.description == plasmid.description
|
|
True
|
|
>>> new.annotations == left.annotations == right.annotations
|
|
True
|
|
>>> new.letter_annotations == plasmid.letter_annotations
|
|
True
|
|
>>> new.dbxrefs == left.dbxrefs == right.dbxrefs
|
|
True
|
|
|
|
However, we should point out that when we sliced the SeqRecord,
|
|
any annotations dictionary or dbxrefs list entries were lost.
|
|
You can explicitly copy them like this:
|
|
|
|
>>> new.annotations = plasmid.annotations.copy()
|
|
>>> new.dbxrefs = plasmid.dbxrefs[:]
|
|
"""
|
|
|
|
if self._seq is None:
|
|
raise ValueError("Left operand seq=None, can't add")
|
|
|
|
if not isinstance(other, SeqRecord):
|
|
# Assume it is a string or a Seq.
|
|
# Note can't transfer any per-letter-annotations
|
|
return type(self)(
|
|
self._seq + other,
|
|
id=self.id,
|
|
name=self.name,
|
|
description=self.description,
|
|
features=self.features[:],
|
|
annotations=self.annotations.copy(),
|
|
dbxrefs=self.dbxrefs[:],
|
|
)
|
|
|
|
if other._seq is None:
|
|
raise ValueError("Right SeqRecord has seq=None, can't add")
|
|
|
|
# Adding two SeqRecord objects... must merge annotation
|
|
answer = self._from_validated(
|
|
self._seq + other._seq, features=self.features[:], dbxrefs=self.dbxrefs[:]
|
|
)
|
|
# Will take all the features and all the db cross refs,
|
|
length = len(self)
|
|
for f in other.features:
|
|
answer.features.append(f._shift(length))
|
|
del length
|
|
for ref in other.dbxrefs:
|
|
if ref not in answer.dbxrefs:
|
|
answer.dbxrefs.append(ref)
|
|
|
|
# Take common id/name/description/annotation
|
|
if self.id == other.id:
|
|
answer.id = self.id
|
|
if self.name == other.name:
|
|
answer.name = self.name
|
|
if self.description == other.description:
|
|
answer.description = self.description
|
|
for k, v in self.annotations.items():
|
|
if k in other.annotations and other.annotations[k] == v:
|
|
answer.annotations[k] = v
|
|
# Can append matching per-letter-annotation
|
|
try:
|
|
# To make this type safe, we would need to make sure the types are compatible, eg: no adding tuples and str
|
|
for k, v in self.letter_annotations.items(): # type: ignore
|
|
if k in other.letter_annotations:
|
|
# avoid length checks, but otherwise equivalent to answer.letter_annotations[k] = v + other.letter_annotations[k]
|
|
dict.__setitem__(answer.letter_annotations, k, v + other.letter_annotations[k]) # type: ignore
|
|
except TypeError:
|
|
print("Failed while try to concatenate letter annotations")
|
|
raise
|
|
|
|
return answer
|
|
|
|
def __radd__(self, other: Union["Seq", "MutableSeq", str]) -> "SeqRecord":
|
|
"""Add another sequence or string to this sequence (from the left).
|
|
|
|
This method handles adding a Seq object (or similar, e.g. MutableSeq)
|
|
or a plain Python string (on the left) to a SeqRecord (on the right).
|
|
See the __add__ method for more details, but for example:
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
|
|
>>> print("%s %s" % (record.id, record.seq))
|
|
slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
|
|
>>> print(list(record.letter_annotations))
|
|
['solexa_quality']
|
|
|
|
>>> new = "ACT" + record
|
|
>>> print("%s %s" % (new.id, new.seq))
|
|
slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
|
|
>>> print(list(new.letter_annotations))
|
|
[]
|
|
"""
|
|
if isinstance(other, SeqRecord):
|
|
raise RuntimeError(
|
|
"This should have happened via the __add__ of "
|
|
"the other SeqRecord being added!"
|
|
)
|
|
if self.seq is None:
|
|
raise TypeError("Can't add (right hand side) SeqRecord with seq = None")
|
|
# Assume it is a string or a Seq.
|
|
# Note can't transfer any per-letter-annotations
|
|
offset = len(other)
|
|
return type(self)(
|
|
cast(Union[Seq, MutableSeq], other + self.seq),
|
|
id=self.id,
|
|
name=self.name,
|
|
description=self.description,
|
|
features=[f._shift(offset) for f in self.features],
|
|
annotations=self.annotations.copy(),
|
|
dbxrefs=self.dbxrefs[:],
|
|
)
|
|
|
|
def count(self, sub, start=None, end=None):
|
|
"""Return the number of non-overlapping occurrences of sub in seq[start:end].
|
|
|
|
Optional arguments start and end are interpreted as in slice notation.
|
|
This method behaves as the count method of Python strings.
|
|
"""
|
|
if self._seq is None:
|
|
raise ValueError(
|
|
"seq is SeqRecord is None, assign it a sequence before applying count"
|
|
)
|
|
return self.seq.count(sub, start, end)
|
|
|
|
def upper(self) -> "SeqRecord":
|
|
"""Return a copy of the record with an upper case sequence.
|
|
|
|
All the annotation is preserved unchanged. e.g.
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> from Bio.SeqRecord import SeqRecord
|
|
>>> record = SeqRecord(Seq("acgtACGT"), id="Test",
|
|
... description = "Made up for this example")
|
|
>>> record.letter_annotations["phred_quality"] = [1, 2, 3, 4, 5, 6, 7, 8]
|
|
>>> print(record.upper().format("fastq"))
|
|
@Test Made up for this example
|
|
ACGTACGT
|
|
+
|
|
"#$%&'()
|
|
<BLANKLINE>
|
|
|
|
Naturally, there is a matching lower method:
|
|
|
|
>>> print(record.lower().format("fastq"))
|
|
@Test Made up for this example
|
|
acgtacgt
|
|
+
|
|
"#$%&'()
|
|
<BLANKLINE>
|
|
"""
|
|
if self.seq is None:
|
|
raise ValueError(
|
|
"seq is SeqRecord is None, assign it a sequence before applying upper"
|
|
)
|
|
return self._from_validated(
|
|
self.seq.upper(),
|
|
id=self.id,
|
|
name=self.name,
|
|
description=self.description,
|
|
dbxrefs=self.dbxrefs[:],
|
|
features=self.features[:],
|
|
annotations=self.annotations.copy(),
|
|
letter_annotations=self.letter_annotations.copy(),
|
|
)
|
|
|
|
def lower(self) -> "SeqRecord":
|
|
"""Return a copy of the record with a lower case sequence.
|
|
|
|
All the annotation is preserved unchanged. e.g.
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> record = SeqIO.read("Fasta/aster.pro", "fasta")
|
|
>>> print(record.format("fasta"))
|
|
>gi|3298468|dbj|BAA31520.1| SAMIPF
|
|
GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG
|
|
VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
|
|
<BLANKLINE>
|
|
>>> print(record.lower().format("fasta"))
|
|
>gi|3298468|dbj|BAA31520.1| SAMIPF
|
|
gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg
|
|
vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani
|
|
<BLANKLINE>
|
|
|
|
To take a more annotation rich example,
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> old = SeqIO.read("EMBL/TRBG361.embl", "embl")
|
|
>>> len(old.features)
|
|
3
|
|
>>> new = old.lower()
|
|
>>> len(old.features) == len(new.features)
|
|
True
|
|
>>> old.annotations["organism"] == new.annotations["organism"]
|
|
True
|
|
>>> old.dbxrefs == new.dbxrefs
|
|
True
|
|
"""
|
|
if self.seq is None:
|
|
raise ValueError(
|
|
"seq is SeqRecord is None, assign it a sequence before applying lower"
|
|
)
|
|
return self._from_validated(
|
|
self.seq.lower(),
|
|
id=self.id,
|
|
name=self.name,
|
|
description=self.description,
|
|
dbxrefs=self.dbxrefs[:],
|
|
features=self.features[:],
|
|
annotations=self.annotations.copy(),
|
|
letter_annotations=self.letter_annotations.copy(),
|
|
)
|
|
|
|
def isupper(self):
|
|
"""Return True if all ASCII characters in the record's sequence are uppercase.
|
|
|
|
If there are no cased characters, the method returns False.
|
|
"""
|
|
return self.seq.isupper()
|
|
|
|
def islower(self):
|
|
"""Return True if all ASCII characters in the record's sequence are lowercase.
|
|
|
|
If there are no cased characters, the method returns False.
|
|
"""
|
|
return self.seq.islower()
|
|
|
|
def reverse_complement(
|
|
self,
|
|
id: bool = False,
|
|
name: bool = False,
|
|
description: bool = False,
|
|
features: bool = True,
|
|
annotations: bool = False,
|
|
letter_annotations: bool = True,
|
|
dbxrefs: bool = False,
|
|
) -> "SeqRecord":
|
|
"""Return new SeqRecord with reverse complement sequence.
|
|
|
|
By default the new record does NOT preserve the sequence identifier,
|
|
name, description, general annotation or database cross-references -
|
|
these are unlikely to apply to the reversed sequence.
|
|
|
|
You can specify the returned record's id, name and description as
|
|
strings, or True to keep that of the parent, or False for a default.
|
|
|
|
You can specify the returned record's features with a list of
|
|
SeqFeature objects, or True to keep that of the parent, or False to
|
|
omit them. The default is to keep the original features (with the
|
|
strand and locations adjusted).
|
|
|
|
You can also specify both the returned record's annotations and
|
|
letter_annotations as dictionaries, True to keep that of the parent,
|
|
or False to omit them. The default is to keep the original
|
|
annotations (with the letter annotations reversed).
|
|
|
|
To show what happens to the pre-letter annotations, consider an
|
|
example Solexa variant FASTQ file with a single entry, which we'll
|
|
read in as a SeqRecord:
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
|
|
>>> print("%s %s" % (record.id, record.seq))
|
|
slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
|
|
>>> print(list(record.letter_annotations))
|
|
['solexa_quality']
|
|
>>> print(record.letter_annotations["solexa_quality"])
|
|
[40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
|
|
|
|
Now take the reverse complement, here we explicitly give a new
|
|
identifier (the old identifier with a suffix):
|
|
|
|
>>> rc_record = record.reverse_complement(id=record.id + "_rc")
|
|
>>> print("%s %s" % (rc_record.id, rc_record.seq))
|
|
slxa_0001_1_0001_01_rc NNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
|
|
|
|
Notice that the per-letter-annotations have also been reversed,
|
|
although this may not be appropriate for all cases.
|
|
|
|
>>> print(rc_record.letter_annotations["solexa_quality"])
|
|
[-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
|
|
|
|
Now for the features, we need a different example. Parsing a GenBank
|
|
file is probably the easiest way to get an nice example with features
|
|
in it...
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> with open("GenBank/pBAD30.gb") as handle:
|
|
... plasmid = SeqIO.read(handle, "gb")
|
|
>>> print("%s %i" % (plasmid.id, len(plasmid)))
|
|
pBAD30 4923
|
|
>>> plasmid.seq
|
|
Seq('GCTAGCGGAGTGTATACTGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGA...ATG')
|
|
>>> len(plasmid.features)
|
|
13
|
|
|
|
Now, let's take the reverse complement of this whole plasmid:
|
|
|
|
>>> rc_plasmid = plasmid.reverse_complement(id=plasmid.id+"_rc")
|
|
>>> print("%s %i" % (rc_plasmid.id, len(rc_plasmid)))
|
|
pBAD30_rc 4923
|
|
>>> rc_plasmid.seq
|
|
Seq('CATGGGCAAATATTATACGCAAGGCGACAAGGTGCTGATGCCGCTGGCGATTCA...AGC')
|
|
>>> len(rc_plasmid.features)
|
|
13
|
|
|
|
Let's compare the first CDS feature - it has gone from being the
|
|
second feature (index 1) to the second last feature (index -2), its
|
|
strand has changed, and the location switched round.
|
|
|
|
>>> print(plasmid.features[1])
|
|
type: CDS
|
|
location: [1081:1960](-)
|
|
qualifiers:
|
|
Key: label, Value: ['araC']
|
|
Key: note, Value: ['araC regulator of the arabinose BAD promoter']
|
|
Key: vntifkey, Value: ['4']
|
|
<BLANKLINE>
|
|
>>> print(rc_plasmid.features[-2])
|
|
type: CDS
|
|
location: [2963:3842](+)
|
|
qualifiers:
|
|
Key: label, Value: ['araC']
|
|
Key: note, Value: ['araC regulator of the arabinose BAD promoter']
|
|
Key: vntifkey, Value: ['4']
|
|
<BLANKLINE>
|
|
|
|
You can check this new location, based on the length of the plasmid:
|
|
|
|
>>> len(plasmid) - 1081
|
|
3842
|
|
>>> len(plasmid) - 1960
|
|
2963
|
|
|
|
Note that if the SeqFeature annotation includes any strand specific
|
|
information (e.g. base changes for a SNP), this information is not
|
|
amended, and would need correction after the reverse complement.
|
|
|
|
Note trying to reverse complement a protein SeqRecord raises an
|
|
exception:
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> from Bio.SeqRecord import SeqRecord
|
|
>>> protein_rec = SeqRecord(Seq("MAIVMGR"), id="Test",
|
|
... annotations={"molecule_type": "protein"})
|
|
>>> protein_rec.reverse_complement()
|
|
Traceback (most recent call last):
|
|
...
|
|
ValueError: Proteins do not have complements!
|
|
|
|
If you have RNA without any U bases, it must be annotated as RNA
|
|
otherwise it will be treated as DNA by default with A mapped to T:
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> from Bio.SeqRecord import SeqRecord
|
|
>>> rna1 = SeqRecord(Seq("ACG"), id="Test")
|
|
>>> rna2 = SeqRecord(Seq("ACG"), id="Test", annotations={"molecule_type": "RNA"})
|
|
>>> print(rna1.reverse_complement(id="RC", description="unk").format("fasta"))
|
|
>RC unk
|
|
CGT
|
|
<BLANKLINE>
|
|
>>> print(rna2.reverse_complement(id="RC", description="RNA").format("fasta"))
|
|
>RC RNA
|
|
CGU
|
|
<BLANKLINE>
|
|
|
|
Also note you can reverse complement a SeqRecord using a MutableSeq:
|
|
|
|
>>> from Bio.Seq import MutableSeq
|
|
>>> from Bio.SeqRecord import SeqRecord
|
|
>>> rec = SeqRecord(MutableSeq("ACGT"), id="Test")
|
|
>>> rec.seq[0] = "T"
|
|
>>> print("%s %s" % (rec.id, rec.seq))
|
|
Test TCGT
|
|
>>> rc = rec.reverse_complement(id=True)
|
|
>>> print("%s %s" % (rc.id, rc.seq))
|
|
Test ACGA
|
|
"""
|
|
|
|
if self.seq is None:
|
|
raise ValueError(
|
|
"Seq in SeqRecord is None, so can't construct the reverse_complement. Please assign it a sequence first"
|
|
)
|
|
|
|
if "protein" in cast(str, self.annotations.get("molecule_type", "")):
|
|
raise ValueError("Proteins do not have complements!")
|
|
if "RNA" in cast(str, self.annotations.get("molecule_type", "")):
|
|
seq = self.seq.reverse_complement_rna()
|
|
else:
|
|
# Default to DNA
|
|
seq = self.seq.reverse_complement()
|
|
if isinstance(self.seq, MutableSeq):
|
|
seq = Seq(seq)
|
|
answer = self._from_validated(seq)
|
|
if isinstance(id, str):
|
|
answer.id = id
|
|
elif id:
|
|
answer.id = self.id
|
|
if isinstance(name, str):
|
|
answer.name = name
|
|
elif name:
|
|
answer.name = self.name
|
|
if isinstance(description, str):
|
|
answer.description = description
|
|
elif description:
|
|
answer.description = self.description
|
|
if isinstance(dbxrefs, list):
|
|
answer.dbxrefs = dbxrefs
|
|
elif dbxrefs:
|
|
# Copy the old dbxrefs
|
|
answer.dbxrefs = self.dbxrefs[:]
|
|
if isinstance(features, list):
|
|
answer.features = features
|
|
elif features:
|
|
# Copy the old features, adjusting location and string
|
|
length = len(answer)
|
|
answer.features = [f._flip(length) for f in self.features]
|
|
|
|
# The old list should have been sorted by start location,
|
|
# reversing it will leave it sorted by what is now the end position,
|
|
# so we need to resort in case of overlapping features.
|
|
# NOTE - In the common case of gene before CDS (and similar) with
|
|
# the exact same locations, this will still maintain gene before CDS
|
|
|
|
def key_fun(f):
|
|
"""Sort on start position."""
|
|
try:
|
|
return int(f.location.start)
|
|
except TypeError: # Expected for UnknownPosition
|
|
return None
|
|
|
|
answer.features.sort(key=key_fun)
|
|
if isinstance(annotations, dict):
|
|
answer.annotations = annotations
|
|
elif annotations:
|
|
# Copy the old annotations,
|
|
answer.annotations = self.annotations.copy()
|
|
if isinstance(letter_annotations, dict):
|
|
answer.letter_annotations = letter_annotations
|
|
elif letter_annotations:
|
|
# Copy the old per letter annotations, reversing them
|
|
for key, value in self.letter_annotations.items():
|
|
answer.letter_annotations[key] = value[::-1]
|
|
return answer
|
|
|
|
def translate(
|
|
self,
|
|
# Seq translation arguments:
|
|
table: str = "Standard",
|
|
stop_symbol: str = "*",
|
|
to_stop: bool = False,
|
|
cds: bool = False,
|
|
gap: Optional[str] = None,
|
|
# SeqRecord annotation arguments:
|
|
id: bool = False,
|
|
name: bool = False,
|
|
description: bool = False,
|
|
features: bool = False,
|
|
annotations: bool = False,
|
|
letter_annotations: bool = False,
|
|
dbxrefs: bool = False,
|
|
) -> "SeqRecord":
|
|
"""Return new SeqRecord with translated sequence.
|
|
|
|
This calls the record's .seq.translate() method (which describes
|
|
the translation related arguments, like table for the genetic code),
|
|
|
|
By default the new record does NOT preserve the sequence identifier,
|
|
name, description, general annotation or database cross-references -
|
|
these are unlikely to apply to the translated sequence.
|
|
|
|
You can specify the returned record's id, name and description as
|
|
strings, or True to keep that of the parent, or False for a default.
|
|
|
|
You can specify the returned record's features with a list of
|
|
SeqFeature objects, or False (default) to omit them.
|
|
|
|
You can also specify both the returned record's annotations and
|
|
letter_annotations as dictionaries, True to keep that of the parent
|
|
(annotations only), or False (default) to omit them.
|
|
|
|
e.g. Loading a FASTA gene and translating it,
|
|
|
|
>>> from Bio import SeqIO
|
|
>>> gene_record = SeqIO.read("Fasta/sweetpea.nu", "fasta")
|
|
>>> print(gene_record.format("fasta"))
|
|
>gi|3176602|gb|U78617.1|LOU78617 Lathyrus odoratus phytochrome A (PHYA) gene, partial cds
|
|
CAGGCTGCGCGGTTTCTATTTATGAAGAACAAGGTCCGTATGATAGTTGATTGTCATGCA
|
|
AAACATGTGAAGGTTCTTCAAGACGAAAAACTCCCATTTGATTTGACTCTGTGCGGTTCG
|
|
ACCTTAAGAGCTCCACATAGTTGCCATTTGCAGTACATGGCTAACATGGATTCAATTGCT
|
|
TCATTGGTTATGGCAGTGGTCGTCAATGACAGCGATGAAGATGGAGATAGCCGTGACGCA
|
|
GTTCTACCACAAAAGAAAAAGAGACTTTGGGGTTTGGTAGTTTGTCATAACACTACTCCG
|
|
AGGTTTGTT
|
|
<BLANKLINE>
|
|
|
|
And now translating the record, specifying the new ID and description:
|
|
|
|
>>> protein_record = gene_record.translate(table=11,
|
|
... id="phya",
|
|
... description="translation")
|
|
>>> print(protein_record.format("fasta"))
|
|
>phya translation
|
|
QAARFLFMKNKVRMIVDCHAKHVKVLQDEKLPFDLTLCGSTLRAPHSCHLQYMANMDSIA
|
|
SLVMAVVVNDSDEDGDSRDAVLPQKKKRLWGLVVCHNTTPRFV
|
|
<BLANKLINE>
|
|
|
|
"""
|
|
if "protein" == self.annotations.get("molecule_type", ""):
|
|
raise ValueError("Proteins cannot be translated!")
|
|
|
|
if self.seq is None:
|
|
raise ValueError("Seq in SeqRecord is None, can't be translated")
|
|
|
|
answer = SeqRecord(
|
|
self.seq.translate(
|
|
table=table, stop_symbol=stop_symbol, to_stop=to_stop, cds=cds, gap=gap
|
|
)
|
|
)
|
|
if isinstance(id, str):
|
|
answer.id = id
|
|
elif id:
|
|
answer.id = self.id
|
|
if isinstance(name, str):
|
|
answer.name = name
|
|
elif name:
|
|
answer.name = self.name
|
|
if isinstance(description, str):
|
|
answer.description = description
|
|
elif description:
|
|
answer.description = self.description
|
|
if isinstance(dbxrefs, list):
|
|
answer.dbxrefs = dbxrefs
|
|
elif dbxrefs:
|
|
# Copy the old dbxrefs
|
|
answer.dbxrefs = self.dbxrefs[:]
|
|
if isinstance(features, list):
|
|
answer.features = features
|
|
elif features:
|
|
# Does not make sense to copy old features as locations wrong
|
|
raise TypeError(f"Unexpected features argument {features!r}")
|
|
if isinstance(annotations, dict):
|
|
answer.annotations = annotations
|
|
elif annotations:
|
|
# Copy the old annotations
|
|
answer.annotations = self.annotations.copy()
|
|
# Set/update to protein:
|
|
answer.annotations["molecule_type"] = "protein"
|
|
if isinstance(letter_annotations, dict):
|
|
answer.letter_annotations = letter_annotations
|
|
elif letter_annotations:
|
|
# Does not make sense to copy these as length now wrong
|
|
raise TypeError(
|
|
f"Unexpected letter_annotations argument {letter_annotations!r}"
|
|
)
|
|
return answer
|
|
|
|
|
|
if __name__ == "__main__":
|
|
from Bio._utils import run_doctest
|
|
|
|
run_doctest()
|