mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 21:53:47 +08:00
$ pyupgrade --keep-percent-format --py39-plus \ Bio/*.py Bio/*/*.py BioSQL/*.py Didn't find any changes in tests, scripts, or docs. Followed by removing a few now redundant imports and black in one case.
3292 lines
112 KiB
Python
3292 lines
112 KiB
Python
# Copyright 2000 Andrew Dalke.
|
|
# Copyright 2000-2002 Brad Chapman.
|
|
# Copyright 2004-2005, 2010 by M de Hoon.
|
|
# Copyright 2007-2023 by Peter Cock.
|
|
# All rights reserved.
|
|
#
|
|
# This file is part of the Biopython distribution and governed by your
|
|
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
|
# Please see the LICENSE file that should have been included as part of this
|
|
# package.
|
|
"""Provide objects to represent biological sequences.
|
|
|
|
See also the Seq_ wiki and the chapter in our tutorial:
|
|
- `HTML Tutorial`_
|
|
- `PDF Tutorial`_
|
|
|
|
.. _Seq: http://biopython.org/wiki/Seq
|
|
.. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html
|
|
.. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
|
|
|
|
"""
|
|
|
|
import array
|
|
import collections
|
|
import numbers
|
|
import warnings
|
|
from abc import ABC
|
|
from abc import abstractmethod
|
|
from typing import Dict
|
|
from typing import Optional
|
|
from typing import overload
|
|
from typing import Union
|
|
|
|
from Bio import BiopythonWarning
|
|
from Bio.Data import CodonTable
|
|
from Bio.Data import IUPACData
|
|
|
|
|
|
def _maketrans(complement_mapping):
    """Build a byte translation table from a complement mapping (PRIVATE).

    Arguments:
     - complement_mapping - a dictionary of single letters to their
       complements, such as ambiguous_dna_complement and
       ambiguous_rna_complement from Data.IUPACData.

    The returned table is a bytes object of length 256, suitable for the
    ``translate`` method of ``bytes`` objects when calculating a (reverse)
    complement.

    Both the letters as given in the mapping and their lower case
    counterparts are covered by the table.

    For internal use only.
    """
    # Iterating a dict yields its keys, in the same order as .values().
    source = "".join(complement_mapping).encode("ASCII")
    target = "".join(complement_mapping.values()).encode("ASCII")
    return bytes.maketrans(source + source.lower(), target + target.lower())
|
|
|
|
|
|
# Translation tables used for (reverse) complements, built once at import.
# The DNA table additionally maps U like T, and the RNA table maps T like U,
# so that either table can be applied to a sequence containing both letters.
_dna_complement_table = _maketrans(
    {
        **IUPACData.ambiguous_dna_complement,
        "U": IUPACData.ambiguous_dna_complement["T"],
    }
)
_rna_complement_table = _maketrans(
    {
        **IUPACData.ambiguous_rna_complement,
        "T": IUPACData.ambiguous_rna_complement["U"],
    }
)
|
|
|
|
|
|
class SequenceDataAbstractBaseClass(ABC):
    """Abstract base class for sequence content providers.

    Most users will not need to use this class. It is used internally as a base
    class for sequence content provider classes such as _UndefinedSequenceData
    defined in this module, and _TwoBitSequenceData in Bio.SeqIO.TwoBitIO.
    Instances of these classes can be used instead of a ``bytes`` object as the
    data argument when creating a Seq object, and provide the sequence content
    only when requested via ``__getitem__``. This allows lazy parsers to load
    and parse sequence data from a file only for the requested sequence regions,
    and _UndefinedSequenceData instances to raise an exception when undefined
    sequence data are requested.

    Future implementations of lazy parsers that similarly provide on-demand
    parsing of sequence data should use a subclass of this abstract class and
    implement the abstract methods ``__len__`` and ``__getitem__``:

    * ``__len__`` must return the sequence length;
    * ``__getitem__`` must return

      * a ``bytes`` object for the requested region; or
      * a new instance of the subclass for the requested region; or
      * raise an ``UndefinedSequenceError``.

      Calling ``__getitem__`` for a sequence region of size zero should always
      return an empty ``bytes`` object.
      Calling ``__getitem__`` for the full sequence (as in data[:]) should
      either return a ``bytes`` object with the full sequence, or raise an
      ``UndefinedSequenceError``.

    Subclasses of SequenceDataAbstractBaseClass must call ``super().__init__()``
    as part of their ``__init__`` method.

    All other methods defined here first materialize the full sequence with
    ``bytes(self)`` (i.e. ``self[:]``) and then delegate to the ``bytes``
    method of the same name.
    """

    __slots__ = ()

    def __init__(self):
        """Check if ``__getitem__`` returns a bytes-like object."""
        # A zero-length slice must compare equal to an empty bytes object.
        assert self[:0] == b""

    @abstractmethod
    def __len__(self):
        """Return the sequence length (to be implemented by subclasses)."""

    @abstractmethod
    def __getitem__(self, key):
        """Return the data for the requested region (to be implemented by subclasses)."""

    def __bytes__(self):
        """Return the full sequence, obtained as ``self[:]``."""
        return self[:]

    def __hash__(self):
        """Return the hash of the full sequence as a ``bytes`` object."""
        return hash(bytes(self))

    def __eq__(self, other):
        """Compare the full sequence as ``bytes`` to other for equality."""
        return bytes(self) == other

    def __lt__(self, other):
        """Return True if the full sequence as ``bytes`` is less than other."""
        return bytes(self) < other

    def __le__(self, other):
        """Return True if the full sequence as ``bytes`` is less than or equal to other."""
        return bytes(self) <= other

    def __gt__(self, other):
        """Return True if the full sequence as ``bytes`` is greater than other."""
        return bytes(self) > other

    def __ge__(self, other):
        """Return True if the full sequence as ``bytes`` is greater than or equal to other."""
        return bytes(self) >= other

    def __add__(self, other):
        """Return the concatenation of this data and other as ``bytes``."""
        try:
            return bytes(self) + bytes(other)
        except UndefinedSequenceError:
            return NotImplemented
            # will be handled by _UndefinedSequenceData.__radd__ or
            # by _PartiallyDefinedSequenceData.__radd__

    def __radd__(self, other):
        """Return other followed by the full sequence as ``bytes``."""
        return other + bytes(self)

    def __mul__(self, other):
        """Return the full sequence as ``bytes`` repeated other times."""
        return other * bytes(self)

    def __contains__(self, item):
        """Return True if item is contained in the full sequence."""
        return bytes(self).__contains__(item)

    def decode(self, encoding="utf-8"):
        """Decode the data as bytes using the codec registered for encoding.

        encoding
          The encoding with which to decode the bytes.
        """
        return bytes(self).decode(encoding)

    def count(self, sub, start=None, end=None):
        """Return the number of non-overlapping occurrences of sub in data[start:end].

        Optional arguments start and end are interpreted as in slice notation.
        This method behaves as the count method of Python strings.
        """
        return bytes(self).count(sub, start, end)

    def find(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start,end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).find(sub, start, end)

    def rfind(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start,end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).rfind(sub, start, end)

    def index(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start,end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).index(sub, start, end)

    def rindex(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start,end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raise ValueError when the subsection is not found.
        """
        return bytes(self).rindex(sub, start, end)

    def startswith(self, prefix, start=None, end=None):
        """Return True if data starts with the specified prefix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        prefix can also be a tuple of bytes to try.
        """
        return bytes(self).startswith(prefix, start, end)

    def endswith(self, suffix, start=None, end=None):
        """Return True if data ends with the specified suffix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        suffix can also be a tuple of bytes to try.
        """
        return bytes(self).endswith(suffix, start, end)

    def split(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.
        """
        return bytes(self).split(sep, maxsplit)

    def rsplit(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.

        Splitting is done starting at the end of the data and working to the front.
        """
        return bytes(self).rsplit(sep, maxsplit)

    def strip(self, chars=None):
        """Strip leading and trailing characters contained in the argument.

        If the argument is omitted or None, strip leading and trailing ASCII whitespace.
        """
        return bytes(self).strip(chars)

    def lstrip(self, chars=None):
        """Strip leading characters contained in the argument.

        If the argument is omitted or None, strip leading ASCII whitespace.
        """
        return bytes(self).lstrip(chars)

    def rstrip(self, chars=None):
        """Strip trailing characters contained in the argument.

        If the argument is omitted or None, strip trailing ASCII whitespace.
        """
        return bytes(self).rstrip(chars)

    def removeprefix(self, prefix):
        """Remove the prefix if present."""
        data = bytes(self)
        try:
            # bytes.removeprefix is only available on Python 3.9+
            return data.removeprefix(prefix)
        except AttributeError:
            if data.startswith(prefix):
                return data[len(prefix) :]
            else:
                return data

    def removesuffix(self, suffix):
        """Remove the suffix if present."""
        data = bytes(self)
        try:
            # bytes.removesuffix is only available on Python 3.9+
            return data.removesuffix(suffix)
        except AttributeError:
            # The empty suffix must be excluded explicitly, as data[:-0]
            # would return an empty bytes object instead of the data.
            if suffix and data.endswith(suffix):
                return data[: -len(suffix)]
            else:
                return data

    def upper(self):
        """Return a copy of data with all ASCII characters converted to uppercase."""
        return bytes(self).upper()

    def lower(self):
        """Return a copy of data with all ASCII characters converted to lowercase."""
        return bytes(self).lower()

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).isupper()

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).islower()

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        return bytes(self).replace(old, new)

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

          table
            Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        return bytes(self).translate(table, delete)

    @property
    def defined(self):
        """Return True if the sequence is defined, False if undefined or partially defined.

        Zero-length sequences are always considered to be defined.
        """
        return True

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        length = len(self)
        if length > 0:
            return ((0, length),)
        else:
            return ()
|
|
|
|
|
|
class _SeqAbstractBaseClass(ABC):
|
|
"""Abstract base class for the Seq and MutableSeq classes (PRIVATE).
|
|
|
|
Most users will not need to use this class. It is used internally as an
|
|
abstract base class for Seq and MutableSeq, as most of their methods are
|
|
identical.
|
|
"""
|
|
|
|
__slots__ = ("_data",)
|
|
__array_ufunc__ = None # turn off numpy Ufuncs
|
|
|
|
@abstractmethod
|
|
def __init__(self):
|
|
pass
|
|
|
|
def __bytes__(self):
|
|
return bytes(self._data)
|
|
|
|
def __repr__(self):
    """Return (truncated) representation of the sequence."""

    def _preview(letters):
        # Anything longer than 60 letters is abbreviated to its first 54
        # letters, three dots, and its last three letters (54+3+3=60).
        # The last three letters are shown as it is often useful to see
        # if there is a stop codon at the end of a sequence.
        if len(letters) > 60:
            head = letters[:54].decode("ASCII")
            tail = letters[-3:].decode("ASCII")
            return f"{head}...{tail}"
        return letters.decode("ASCII")

    data = self._data
    if isinstance(data, _UndefinedSequenceData):
        # No sequence contents available at all; show the length only.
        return f"Seq(None, length={len(self)})"
    if isinstance(data, _PartiallyDefinedSequenceData):
        # Show each defined fragment, keyed by its start position.
        fragments = {}
        for position, fragment in data._data.items():
            fragments[position] = _preview(fragment)
        return "Seq(%r, length=%d)" % (fragments, len(self))
    return f"{self.__class__.__name__}('{_preview(data)}')"
|
|
|
|
def __str__(self):
|
|
"""Return the full sequence as a python string."""
|
|
return self._data.decode("ASCII")
|
|
|
|
def __eq__(self, other):
|
|
"""Compare the sequence to another sequence or a string.
|
|
|
|
Sequences are equal to each other if their sequence contents is
|
|
identical:
|
|
|
|
>>> from Bio.Seq import Seq, MutableSeq
|
|
>>> seq1 = Seq("ACGT")
|
|
>>> seq2 = Seq("ACGT")
|
|
>>> mutable_seq = MutableSeq("ACGT")
|
|
>>> seq1 == seq2
|
|
True
|
|
>>> seq1 == mutable_seq
|
|
True
|
|
>>> seq1 == "ACGT"
|
|
True
|
|
|
|
Note that the sequence objects themselves are not identical to each
|
|
other:
|
|
|
|
>>> id(seq1) == id(seq2)
|
|
False
|
|
>>> seq1 is seq2
|
|
False
|
|
|
|
Sequences can also be compared to strings, ``bytes``, and ``bytearray``
|
|
objects:
|
|
|
|
>>> seq1 == "ACGT"
|
|
True
|
|
>>> seq1 == b"ACGT"
|
|
True
|
|
>>> seq1 == bytearray(b"ACGT")
|
|
True
|
|
"""
|
|
if isinstance(other, _SeqAbstractBaseClass):
|
|
return self._data == other._data
|
|
elif isinstance(other, str):
|
|
return self._data == other.encode("ASCII")
|
|
else:
|
|
return self._data == other
|
|
|
|
def __lt__(self, other):
|
|
"""Implement the less-than operand."""
|
|
if isinstance(other, _SeqAbstractBaseClass):
|
|
return self._data < other._data
|
|
elif isinstance(other, str):
|
|
return self._data < other.encode("ASCII")
|
|
else:
|
|
return self._data < other
|
|
|
|
def __le__(self, other):
|
|
"""Implement the less-than or equal operand."""
|
|
if isinstance(other, _SeqAbstractBaseClass):
|
|
return self._data <= other._data
|
|
elif isinstance(other, str):
|
|
return self._data <= other.encode("ASCII")
|
|
else:
|
|
return self._data <= other
|
|
|
|
def __gt__(self, other):
|
|
"""Implement the greater-than operand."""
|
|
if isinstance(other, _SeqAbstractBaseClass):
|
|
return self._data > other._data
|
|
elif isinstance(other, str):
|
|
return self._data > other.encode("ASCII")
|
|
else:
|
|
return self._data > other
|
|
|
|
def __ge__(self, other):
|
|
"""Implement the greater-than or equal operand."""
|
|
if isinstance(other, _SeqAbstractBaseClass):
|
|
return self._data >= other._data
|
|
elif isinstance(other, str):
|
|
return self._data >= other.encode("ASCII")
|
|
else:
|
|
return self._data >= other
|
|
|
|
def __len__(self):
|
|
"""Return the length of the sequence."""
|
|
return len(self._data)
|
|
|
|
def __iter__(self):
|
|
"""Return an iterable of the sequence."""
|
|
return self._data.decode("ASCII").__iter__()
|
|
|
|
@overload
|
|
def __getitem__(self, index: int) -> str: ...
|
|
|
|
@overload
|
|
def __getitem__(self, index: slice) -> "Seq": ...
|
|
|
|
def __getitem__(self, index):
|
|
"""Return a subsequence as a single letter or as a sequence object.
|
|
|
|
If the index is an integer, a single letter is returned as a Python
|
|
string:
|
|
|
|
>>> seq = Seq('ACTCGACGTCG')
|
|
>>> seq[5]
|
|
'A'
|
|
|
|
Otherwise, a new sequence object of the same class is returned:
|
|
|
|
>>> seq[5:8]
|
|
Seq('ACG')
|
|
>>> mutable_seq = MutableSeq('ACTCGACGTCG')
|
|
>>> mutable_seq[5:8]
|
|
MutableSeq('ACG')
|
|
"""
|
|
if isinstance(index, numbers.Integral):
|
|
# Return a single letter as a string
|
|
return chr(self._data[index])
|
|
else:
|
|
# Return the (sub)sequence as another Seq/MutableSeq object
|
|
return self.__class__(self._data[index])
|
|
|
|
def __add__(self, other):
|
|
"""Add a sequence or string to this sequence.
|
|
|
|
>>> from Bio.Seq import Seq, MutableSeq
|
|
>>> Seq("MELKI") + "LV"
|
|
Seq('MELKILV')
|
|
>>> MutableSeq("MELKI") + "LV"
|
|
MutableSeq('MELKILV')
|
|
"""
|
|
if isinstance(other, _SeqAbstractBaseClass):
|
|
return self.__class__(self._data + other._data)
|
|
elif isinstance(other, str):
|
|
return self.__class__(self._data + other.encode("ASCII"))
|
|
else:
|
|
# If other is a SeqRecord, then SeqRecord's __radd__ will handle
|
|
# this. If not, returning NotImplemented will trigger a TypeError.
|
|
return NotImplemented
|
|
|
|
def __radd__(self, other):
|
|
"""Add a sequence string on the left.
|
|
|
|
>>> from Bio.Seq import Seq, MutableSeq
|
|
>>> "LV" + Seq("MELKI")
|
|
Seq('LVMELKI')
|
|
>>> "LV" + MutableSeq("MELKI")
|
|
MutableSeq('LVMELKI')
|
|
|
|
Adding two sequence objects is handled via the __add__ method.
|
|
"""
|
|
if isinstance(other, str):
|
|
return self.__class__(other.encode("ASCII") + self._data)
|
|
else:
|
|
return NotImplemented
|
|
|
|
def __mul__(self, other):
|
|
"""Multiply sequence by integer.
|
|
|
|
>>> from Bio.Seq import Seq, MutableSeq
|
|
>>> Seq('ATG') * 2
|
|
Seq('ATGATG')
|
|
>>> MutableSeq('ATG') * 2
|
|
MutableSeq('ATGATG')
|
|
"""
|
|
if not isinstance(other, numbers.Integral):
|
|
raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
|
|
# we would like to simply write
|
|
# data = self._data * other
|
|
# here, but currently that causes a bug on PyPy if self._data is a
|
|
# bytearray and other is a numpy integer. Using this workaround:
|
|
data = self._data.__mul__(other)
|
|
return self.__class__(data)
|
|
|
|
def __rmul__(self, other):
|
|
"""Multiply integer by sequence.
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> 2 * Seq('ATG')
|
|
Seq('ATGATG')
|
|
"""
|
|
if not isinstance(other, numbers.Integral):
|
|
raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
|
|
# we would like to simply write
|
|
# data = self._data * other
|
|
# here, but currently that causes a bug on PyPy if self._data is a
|
|
# bytearray and other is a numpy integer. Using this workaround:
|
|
data = self._data.__mul__(other)
|
|
return self.__class__(data)
|
|
|
|
def __imul__(self, other):
|
|
"""Multiply the sequence object by other and assign.
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> seq = Seq('ATG')
|
|
>>> seq *= 2
|
|
>>> seq
|
|
Seq('ATGATG')
|
|
|
|
Note that this is different from in-place multiplication. The ``seq``
|
|
variable is reassigned to the multiplication result, but any variable
|
|
pointing to ``seq`` will remain unchanged:
|
|
|
|
>>> seq = Seq('ATG')
|
|
>>> seq2 = seq
|
|
>>> id(seq) == id(seq2)
|
|
True
|
|
>>> seq *= 2
|
|
>>> seq
|
|
Seq('ATGATG')
|
|
>>> seq2
|
|
Seq('ATG')
|
|
>>> id(seq) == id(seq2)
|
|
False
|
|
"""
|
|
if not isinstance(other, numbers.Integral):
|
|
raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
|
|
# we would like to simply write
|
|
# data = self._data * other
|
|
# here, but currently that causes a bug on PyPy if self._data is a
|
|
# bytearray and other is a numpy integer. Using this workaround:
|
|
data = self._data.__mul__(other)
|
|
return self.__class__(data)
|
|
|
|
def count(self, sub, start=None, end=None):
    """Return a non-overlapping count, like that of a python string.

    Returns an integer, the number of occurrences of substring argument
    sub in the (sub)sequence given by [start:end]. Optional arguments
    start and end are interpreted as in slice notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count("A"))
    5
    >>> print(my_seq.count("ATG"))
    1
    >>> print(my_seq.count(Seq("AT")))
    1
    >>> print(my_seq.count("AT", 2, -1))
    1

    HOWEVER, please note because the ``count`` method of Seq and MutableSeq
    objects, like that of Python strings, do a non-overlapping search, this
    may not give the answer you expect:

    >>> "AAAA".count("AA")
    2
    >>> print(Seq("AAAA").count("AA"))
    2

    For an overlapping search, use the ``count_overlap`` method:

    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    """
    # Reduce sub to a bytes or bytearray object before searching.
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.count(needle, start, end)
|
|
|
|
def count_overlap(self, sub, start=None, end=None):
    """Return an overlapping count.

    Returns an integer, the number of occurrences of substring
    argument sub in the (sub)sequence given by [start:end].
    Optional arguments start and end are interpreted as in slice
    notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    e.g.

    >>> from Bio.Seq import Seq
    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    >>> print(Seq("ATATATATA").count_overlap("ATA"))
    4
    >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1))
    1

    For a non-overlapping search, use the ``count`` method:

    >>> print(Seq("AAAA").count("AA"))
    2

    Where substrings do not overlap, ``count_overlap`` behaves the same as
    the ``count`` method:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count_overlap("A"))
    5
    >>> my_seq.count_overlap("A") == my_seq.count("A")
    True
    >>> print(my_seq.count_overlap("ATG"))
    1
    >>> my_seq.count_overlap("ATG") == my_seq.count("ATG")
    True
    >>> print(my_seq.count_overlap(Seq("AT")))
    1
    >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT"))
    True
    >>> print(my_seq.count_overlap("AT", 2, -1))
    1
    >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1)
    True

    HOWEVER, do not use this method for such cases because the
    count() method is much more efficient.
    """
    # Reduce sub to a bytes or bytearray object before searching.
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    data = self._data
    overlap_count = 0
    # After each hit, resume the search one letter past the start of that
    # hit, so that overlapping occurrences are counted too.
    position = data.find(needle, start, end)
    while position != -1:
        overlap_count += 1
        position = data.find(needle, position + 1, end)
    return overlap_count
|
|
|
|
def __contains__(self, item):
|
|
"""Return True if item is a subsequence of the sequence, and False otherwise.
|
|
|
|
e.g.
|
|
|
|
>>> from Bio.Seq import Seq, MutableSeq
|
|
>>> my_dna = Seq("ATATGAAATTTGAAAA")
|
|
>>> "AAA" in my_dna
|
|
True
|
|
>>> Seq("AAA") in my_dna
|
|
True
|
|
>>> MutableSeq("AAA") in my_dna
|
|
True
|
|
"""
|
|
if isinstance(item, _SeqAbstractBaseClass):
|
|
item = bytes(item)
|
|
elif isinstance(item, str):
|
|
item = item.encode("ASCII")
|
|
return item in self._data
|
|
|
|
def find(self, sub, start=None, end=None):
|
|
"""Return the lowest index in the sequence where subsequence sub is found.
|
|
|
|
With optional arguments start and end, return the lowest index in the
|
|
sequence such that the subsequence sub is contained within the sequence
|
|
region [start:end].
|
|
|
|
Arguments:
|
|
- sub - a string or another Seq or MutableSeq object to search for
|
|
- start - optional integer, slice start
|
|
- end - optional integer, slice end
|
|
|
|
Returns -1 if the subsequence is NOT found.
|
|
|
|
e.g. Locating the first typical start codon, AUG, in an RNA sequence:
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
|
|
>>> my_rna.find("AUG")
|
|
3
|
|
|
|
The next typical start codon can then be found by starting the search
|
|
at position 4:
|
|
|
|
>>> my_rna.find("AUG", 4)
|
|
15
|
|
|
|
See the ``search`` method to find the locations of multiple subsequences
|
|
at the same time.
|
|
"""
|
|
if isinstance(sub, _SeqAbstractBaseClass):
|
|
sub = bytes(sub)
|
|
elif isinstance(sub, str):
|
|
sub = sub.encode("ASCII")
|
|
elif not isinstance(sub, (bytes, bytearray)):
|
|
raise TypeError(
|
|
"a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
|
|
% type(sub)
|
|
)
|
|
return self._data.find(sub, start, end)
|
|
|
|
def rfind(self, sub, start=None, end=None):
|
|
"""Return the highest index in the sequence where subsequence sub is found.
|
|
|
|
With optional arguments start and end, return the highest index in the
|
|
sequence such that the subsequence sub is contained within the sequence
|
|
region [start:end].
|
|
|
|
Arguments:
|
|
- sub - a string or another Seq or MutableSeq object to search for
|
|
- start - optional integer, slice start
|
|
- end - optional integer, slice end
|
|
|
|
Returns -1 if the subsequence is NOT found.
|
|
|
|
e.g. Locating the last typical start codon, AUG, in an RNA sequence:
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
|
|
>>> my_rna.rfind("AUG")
|
|
15
|
|
|
|
The location of the typical start codon before that can be found by
|
|
ending the search at position 15:
|
|
|
|
>>> my_rna.rfind("AUG", end=15)
|
|
3
|
|
|
|
See the ``search`` method to find the locations of multiple subsequences
|
|
at the same time.
|
|
"""
|
|
if isinstance(sub, _SeqAbstractBaseClass):
|
|
sub = bytes(sub)
|
|
elif isinstance(sub, str):
|
|
sub = sub.encode("ASCII")
|
|
elif not isinstance(sub, (bytes, bytearray)):
|
|
raise TypeError(
|
|
"a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
|
|
% type(sub)
|
|
)
|
|
return self._data.rfind(sub, start, end)
|
|
|
|
def index(self, sub, start=None, end=None):
    """Return the lowest index in the sequence where subsequence sub is found.

    With optional arguments start and end, return the lowest index in the
    sequence such that the subsequence sub is contained within the sequence
    region [start:end].

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.index("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.index("AUG", 4)
    15

    This method performs the same search as the ``find`` method.  However,
    if the subsequence is not found, ``find`` returns -1 while ``index``
    raises a ValueError:

    >>> my_rna.index("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.find("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    # Reduce sub to a bytes or bytearray object before searching.
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.index(needle, start, end)
|
|
|
|
def rindex(self, sub, start=None, end=None):
    """Return the highest index in the sequence where subsequence sub is found.

    With optional arguments start and end, return the highest index in the
    sequence such that the subsequence sub is contained within the sequence
    region [start:end].

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rindex("AUG")
    15

    The location of the typical start codon before that can be found by
    ending the search at position 15:

    >>> my_rna.rindex("AUG", end=15)
    3

    This method performs the same search as the ``rfind`` method.  However,
    if the subsequence is not found, ``rfind`` returns -1 while ``rindex``
    raises a ValueError:

    >>> my_rna.rindex("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.rfind("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    # Reduce sub to a bytes or bytearray object before searching.
    if isinstance(sub, MutableSeq):
        sub = sub._data
    elif isinstance(sub, Seq):
        sub = bytes(sub)
    elif isinstance(sub, str):
        sub = sub.encode("ASCII")
    elif not isinstance(sub, (bytes, bytearray)):
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rindex(sub, start, end)
|
|
|
|
def search(self, subs):
|
|
"""Search the substrings subs in self and yield the index and substring found.
|
|
|
|
Arguments:
|
|
- subs - a list of strings, Seq, MutableSeq, bytes, or bytearray
|
|
objects containing the substrings to search for.
|
|
|
|
>>> from Bio.Seq import Seq
|
|
>>> dna = Seq("GTCATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGTTG")
|
|
>>> matches = dna.search(["CC", Seq("ATTG"), "ATTG", Seq("CCC")])
|
|
>>> for index, substring in matches:
|
|
... print(index, substring)
|
|
...
|
|
7 CC
|
|
9 ATTG
|
|
20 CC
|
|
34 CC
|
|
34 CCC
|
|
35 CC
|
|
"""
|
|
subdict = collections.defaultdict(set)
|
|
for index, sub in enumerate(subs):
|
|
if isinstance(sub, (_SeqAbstractBaseClass, bytearray)):
|
|
sub = bytes(sub)
|
|
elif isinstance(sub, str):
|
|
sub = sub.encode("ASCII")
|
|
elif not isinstance(sub, bytes):
|
|
raise TypeError(
|
|
"subs[%d]: a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
|
|
% (index, type(sub))
|
|
)
|
|
length = len(sub)
|
|
subdict[length].add(sub)
|
|
for start in range(len(self) - 1):
|
|
for length, subs in subdict.items():
|
|
stop = start + length
|
|
for sub in subs:
|
|
if self._data[start:stop] == sub:
|
|
yield (start, sub.decode())
|
|
break
|
|
|
|
def startswith(self, prefix, start=None, end=None):
    """Return True if the sequence starts with the given prefix, False otherwise.

    Return True if the sequence starts with the specified prefix
    (a string, a bytes-like object, or another Seq object), False otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    prefix can also be a tuple of such objects to try. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.startswith("GUC")
    True
    >>> my_rna.startswith("AUG")
    False
    >>> my_rna.startswith("AUG", 3)
    True
    >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1)
    True

    Strings, bytes and sequence objects may be mixed within a tuple:

    >>> my_rna.startswith((b"AUG", Seq("GUC")))
    True
    """

    def _to_bytes(item):
        # Normalise one candidate to something bytes.startswith accepts.
        if isinstance(item, str):
            return item.encode("ASCII")
        if isinstance(item, (bytes, bytearray)):
            return item
        if isinstance(item, _SeqAbstractBaseClass):
            return bytes(item)
        # Unknown type: hand it over unchanged so that the underlying
        # bytes method raises its usual TypeError.
        return item

    if isinstance(prefix, tuple):
        prefix = tuple(_to_bytes(candidate) for candidate in prefix)
    else:
        prefix = _to_bytes(prefix)
    return self._data.startswith(prefix, start, end)
|
|
|
|
def endswith(self, suffix, start=None, end=None):
    """Return True if the sequence ends with the given suffix, False otherwise.

    Return True if the sequence ends with the specified suffix
    (a string, a bytes-like object, or another Seq object), False otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    suffix can also be a tuple of such objects to try. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.endswith("UUG")
    True
    >>> my_rna.endswith("AUG")
    False
    >>> my_rna.endswith("AUG", 0, 18)
    True
    >>> my_rna.endswith(("UCC", "UCA", "UUG"))
    True

    Strings, bytes and sequence objects may be mixed within a tuple:

    >>> my_rna.endswith((b"AUG", Seq("UUG")))
    True
    """

    def _to_bytes(item):
        # Normalise one candidate to something bytes.endswith accepts.
        if isinstance(item, str):
            return item.encode("ASCII")
        if isinstance(item, (bytes, bytearray)):
            return item
        if isinstance(item, _SeqAbstractBaseClass):
            return bytes(item)
        # Unknown type: hand it over unchanged so that the underlying
        # bytes method raises its usual TypeError.
        return item

    if isinstance(suffix, tuple):
        suffix = tuple(_to_bytes(candidate) for candidate in suffix)
    else:
        suffix = _to_bytes(suffix)
    return self._data.endswith(suffix, start, end)
|
|
|
|
def split(self, sep=None, maxsplit=-1):
    """Return a list of subsequences when splitting the sequence by separator sep.

    The sequence is cut at every occurrence of the delimiter sep and the
    pieces are returned as a list of Seq objects. At most maxsplit cuts
    are made if maxsplit is given; by default all possible cuts are made.

    As with the ``split`` method of Python strings, runs of whitespace
    (tabs, spaces, newlines) act as the separator if sep is None, the
    default value.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.split("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')

    See also the rsplit method, which splits the sequence starting from the
    end:

    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')
    """
    # The underlying data only understands bytes-like separators.
    if isinstance(sep, _SeqAbstractBaseClass):
        delimiter = bytes(sep)
    elif isinstance(sep, str):
        delimiter = sep.encode("ASCII")
    else:
        delimiter = sep
    pieces = self._data.split(delimiter, maxsplit)
    return list(map(Seq, pieces))
|
|
|
|
def rsplit(self, sep=None, maxsplit=-1):
    """Return a list of subsequences by splitting the sequence from the right.

    The sequence is cut at every occurrence of the delimiter sep and the
    pieces are returned as a list of Seq objects. At most maxsplit cuts
    are made, counted from the right, if maxsplit is given; by default
    all possible cuts are made.

    As with the ``rsplit`` method of Python strings, runs of whitespace
    (tabs, spaces, newlines) act as the separator if sep is None, the
    default value.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.rsplit("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')

    See also the split method, which splits the sequence starting from the
    beginning:

    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')
    """
    # The underlying data only understands bytes-like separators.
    if isinstance(sep, _SeqAbstractBaseClass):
        delimiter = bytes(sep)
    elif isinstance(sep, str):
        delimiter = sep.encode("ASCII")
    else:
        delimiter = sep
    pieces = self._data.rsplit(delimiter, maxsplit)
    return list(map(Seq, pieces))
|
|
|
|
def strip(self, chars=None, inplace=False):
    """Return a sequence object with leading and trailing ends stripped.

    Called without arguments, whitespace is removed from both ends:

    >>> seq = Seq(" ACGT ")
    >>> seq.strip()
    Seq('ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars``
    are removed instead. ``chars`` is treated as a set of characters, so
    their order does not matter:

    >>> Seq("ACGTACGT").strip("TGCA")
    Seq('')

    With ``inplace`` set to ``False`` (the default) a stripped copy is
    returned. With ``inplace`` set to ``True`` the sequence itself is
    stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.strip()
    MutableSeq('ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.strip(inplace=True)
    MutableSeq('ACGT')
    >>> seq
    MutableSeq('ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if ``strip``
    is called on a ``Seq`` object with ``inplace=True``.

    See also the lstrip and rstrip methods.
    """
    if isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    elif isinstance(chars, str):
        chars = chars.encode("ASCII")
    try:
        stripped = self._data.strip(chars)
    except TypeError:
        # Replace the bytes-specific message by one mentioning our types.
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
|
|
|
|
def lstrip(self, chars=None, inplace=False):
    """Return a sequence object with the leading end stripped.

    Called without arguments, leading whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.lstrip()
    Seq('ACGT ')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars``
    are removed from the leading end instead. ``chars`` is treated as a
    set of characters, so their order does not matter:

    >>> Seq("ACGACGTTACG").lstrip("GCA")
    Seq('TTACG')

    With ``inplace`` set to ``False`` (the default) a stripped copy is
    returned. With ``inplace`` set to ``True`` the sequence itself is
    stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.lstrip()
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.lstrip(inplace=True)
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq('ACGT ')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lstrip`` is called on a ``Seq`` object with ``inplace=True``.

    See also the strip and rstrip methods.
    """
    if isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    elif isinstance(chars, str):
        chars = chars.encode("ASCII")
    try:
        stripped = self._data.lstrip(chars)
    except TypeError:
        # Replace the bytes-specific message by one mentioning our types.
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
|
|
|
|
def rstrip(self, chars=None, inplace=False):
    """Return a sequence object with trailing ends stripped.

    Called without arguments, trailing whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.rstrip()
    Seq(' ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars``
    are removed from the trailing end instead. ``chars`` is treated as a
    set of characters, so their order does not matter:

    >>> Seq("ACGACGTTACG").rstrip("GCA")
    Seq('ACGACGTT')

    With ``inplace`` set to ``False`` (the default) a stripped copy is
    returned. With ``inplace`` set to ``True`` the sequence itself is
    stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.rstrip()
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.rstrip(inplace=True)
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``rstrip`` is called on a ``Seq`` object with ``inplace=True``.

    See also the strip and lstrip methods.
    """
    if isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    elif isinstance(chars, str):
        chars = chars.encode("ASCII")
    try:
        stripped = self._data.rstrip(chars)
    except TypeError:
        # Replace the bytes-specific message by one mentioning our types.
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
|
|
|
|
def removeprefix(self, prefix, inplace=False):
    """Return a new Seq object with prefix (left) removed.

    The behaviour mirrors the python string method of the same name: the
    prefix is removed only if the sequence actually starts with it.

    e.g. Removing a start Codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("ATGGTGTGTGT")
    >>> my_seq
    Seq('ATGGTGTGTGT')
    >>> my_seq.removeprefix('ATG')
    Seq('GTGTGTGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``removeprefix`` is called on a ``Seq`` object with ``inplace=True``.

    See also the removesuffix method.
    """
    if isinstance(prefix, _SeqAbstractBaseClass):
        prefix = bytes(prefix)
    elif isinstance(prefix, str):
        prefix = prefix.encode("ASCII")
    try:
        trimmed = self._data.removeprefix(prefix)
    except TypeError:
        raise TypeError(
            "argument must be a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    except AttributeError:
        # Fall back for pre-Python 3.9
        trimmed = self._data
        if trimmed.startswith(prefix):
            trimmed = trimmed[len(prefix) :]
    if not inplace:
        return self.__class__(trimmed)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = trimmed
    return self
|
|
|
|
def removesuffix(self, suffix, inplace=False):
    """Return a new Seq object with suffix (right) removed.

    This behaves like the python string method of the same name: the
    suffix is removed only if the sequence actually ends with it, and
    removing an empty suffix leaves the sequence unchanged.

    e.g. Removing a stop codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("GTGTGTGTTAG")
    >>> my_seq
    Seq('GTGTGTGTTAG')
    >>> stop_codon = Seq("TAG")
    >>> my_seq.removesuffix(stop_codon)
    Seq('GTGTGTGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``removesuffix`` is called on a ``Seq`` object with ``inplace=True``.

    See also the removeprefix method.
    """
    if isinstance(suffix, str):
        suffix = suffix.encode("ASCII")
    elif isinstance(suffix, _SeqAbstractBaseClass):
        suffix = bytes(suffix)
    try:
        data = self._data.removesuffix(suffix)
    except TypeError:
        raise TypeError(
            "argument must be a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    except AttributeError:
        # Fall back for data objects without a removesuffix method
        # (e.g. pre-Python 3.9 bytes).
        data = self._data
        # An empty suffix must be a no-op: data[:-0] is data[:0], which
        # would otherwise throw away the entire sequence.
        if data.endswith(suffix) and len(suffix) > 0:
            data = data[: -len(suffix)]
    if inplace:
        # Only bytearray-backed (mutable) sequences can be changed in place.
        if not isinstance(self._data, bytearray):
            raise TypeError("Sequence is immutable")
        self._data[:] = data
        return self
    return self.__class__(data)
|
|
|
|
def upper(self, inplace=False):
    """Return the sequence in upper case.

    With inplace set to False (the default), the sequence itself is left
    untouched and an upper-case copy is returned:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With inplace set to True, the sequence is converted in-place and
    returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``upper`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``lower`` method.
    """
    uppercased = self._data.upper()
    if not inplace:
        return self.__class__(uppercased)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = uppercased
    return self
|
|
|
|
def lower(self, inplace=False):
    """Return the sequence in lower case.

    With inplace set to False (the default), the sequence itself is left
    untouched and a lower-case copy is returned:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With inplace set to True, the sequence is converted in-place and
    returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lower`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``upper`` method.
    """
    lowercased = self._data.lower()
    if not inplace:
        return self.__class__(lowercased)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = lowercased
    return self
|
|
|
|
def isupper(self):
    """Return True if all ASCII characters in data are uppercase.

    False is returned for a sequence without any cased characters.
    """
    sequence_data = self._data
    return sequence_data.isupper()
|
|
|
|
def islower(self):
    """Return True if all ASCII characters in data are lowercase.

    False is returned for a sequence without any cased characters.
    """
    sequence_data = self._data
    return sequence_data.islower()
|
|
|
|
def translate(
    self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
):
    """Turn a nucleotide sequence into a protein sequence by creating a new sequence object.

    This method will translate DNA or RNA sequences. It should not
    be used on protein sequences as any result will be biologically
    meaningless.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable
       object (useful for non-standard genetic codes).  This
       defaults to the "Standard" table.
     - stop_symbol - Single character string, what to use for
       terminators.  This defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons (translated as the
       specified stop_symbol).  If True, translation is terminated at
       the first in frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True,
       this checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to the minus sign.

    A ``Seq`` object is returned if ``translate`` is called on a ``Seq``
    object; a ``MutableSeq`` object is returned if ``translate`` is called
    on a ``MutableSeq`` object.

    e.g. Using the standard table:

    >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna.translate()
    Seq('VAIVMGR*KGAR*')
    >>> coding_dna.translate(stop_symbol="@")
    Seq('VAIVMGR@KGAR@')
    >>> coding_dna.translate(to_stop=True)
    Seq('VAIVMGR')

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> coding_dna.translate(table=2)
    Seq('VAIVMGRWKGAR*')
    >>> coding_dna.translate(table=2, to_stop=True)
    Seq('VAIVMGRWKGAR')

    In fact, GTG is an alternative start codon under NCBI table 2, meaning
    this sequence could be a complete CDS:

    >>> coding_dna.translate(table=2, cds=True)
    Seq('MAIVMGRWKGAR')

    It isn't a valid CDS under NCBI table 1, due to both the start codon
    and also the in frame stop codons:

    >>> coding_dna.translate(table=1, cds=True)
    Traceback (most recent call last):
        ...
    Bio.Data.CodonTable.TranslationError: First codon 'GTG' is not a start codon

    If the sequence has no in-frame stop codon, then the to_stop argument
    has no effect:

    >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
    >>> coding_dna2.translate()
    Seq('LAIVMGR')
    >>> coding_dna2.translate(to_stop=True)
    Seq('LAIVMGR')

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    NOTE - This does NOT behave like the python string's translate
    method.  For that use str(my_seq).translate(...) instead
    """
    try:
        data = str(self)
    except UndefinedSequenceError:
        # translating an undefined sequence yields an undefined
        # sequence with the length divided by 3
        n = len(self)
        if n % 3 != 0:
            warnings.warn(
                "Partial codon, len(sequence) not a multiple of three. "
                "This may become an error in future.",
                BiopythonWarning,
            )
        return Seq(None, n // 3)

    # Reuse the string obtained above instead of converting the sequence
    # a second time.
    return self.__class__(
        _translate_str(data, table, stop_symbol, to_stop, cds, gap=gap)
    )
|
|
|
|
def complement(self, inplace=False):
    """Return the complement as a DNA sequence.

    >>> Seq("CGA").complement()
    Seq('GCT')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").complement()
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    With inplace set to True, the sequence is complemented in-place and
    returned:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement()
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement(inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = complemented
    return self
|
|
|
|
def complement_rna(self, inplace=False):
    """Return the complement as an RNA sequence.

    >>> Seq("CGA").complement_rna()
    Seq('GCU')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence by default:

    >>> Seq("CGA").complement()
    Seq('GCT')

    With inplace set to True, the sequence is complemented in-place and
    returned:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement_rna()
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement_rna(inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = complemented
    return self
|
|
|
|
def reverse_complement(self, inplace=False):
    """Return the reverse complement as a DNA sequence.

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").reverse_complement()
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    With inplace set to True, the sequence is reverse-complemented
    in-place and returned:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement()
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement(inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # reverse complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Writing through a reversed slice stores the complement back to front.
    self._data[::-1] = complemented
    return self
|
|
|
|
def reverse_complement_rna(self, inplace=False):
    """Return the reverse complement as an RNA sequence.

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").reverse_complement_rna()
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    With inplace set to True, the sequence is reverse-complemented
    in-place and returned:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement_rna()
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement_rna(inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # reverse complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Writing through a reversed slice stores the complement back to front.
    self._data[::-1] = complemented
    return self
|
|
|
|
def transcribe(self, inplace=False):
    """Transcribe a DNA sequence into RNA and return the RNA sequence as a new Seq object.

    By the usual convention the sequence is taken to be the coding strand
    of the DNA double helix rather than the template strand, so the RNA
    sequence is obtained simply by switching T to U.

    >>> from Bio.Seq import Seq
    >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> coding_dna.transcribe()
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    With inplace set to True, the sequence is transcribed in-place and
    returned:

    >>> sequence = MutableSeq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence.transcribe()
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    >>> sequence.transcribe(inplace=True)
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to transcribe an RNA sequence has no effect.
    If you have a nucleotide sequence which might be DNA or RNA
    (or even a mixture), calling the transcribe method will ensure
    any T becomes U.

    Trying to transcribe a protein sequence will replace any
    T for Threonine with U for Selenocysteine, which has no
    biologically plausible rationale.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRT")
    >>> my_protein.transcribe()
    Seq('MAIVMGRU')
    """
    # Upper and lower case letters are handled in two separate passes.
    upper_case_done = self._data.replace(b"T", b"U")
    rna = upper_case_done.replace(b"t", b"u")
    if not inplace:
        return self.__class__(rna)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = rna
    return self
|
|
|
|
def back_transcribe(self, inplace=False):
    """Return the DNA sequence from an RNA sequence by creating a new Seq object.

    >>> from Bio.Seq import Seq
    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> messenger_rna
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> messenger_rna.back_transcribe()
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    With inplace set to True, the sequence is back-transcribed in-place
    and returned:

    >>> sequence = MutableSeq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence.back_transcribe()
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    >>> sequence.back_transcribe(inplace=True)
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``back_transcribe`` is called on a ``Seq`` object with
    ``inplace=True``.

    Trying to back-transcribe DNA has no effect. If you have a nucleotide
    sequence which might be DNA or RNA (or even a mixture), calling the
    back-transcribe method will ensure any U becomes T.

    Trying to back-transcribe a protein sequence will replace any U for
    Selenocysteine with T for Threonine, which is biologically meaningless.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRU")
    >>> my_protein.back_transcribe()
    Seq('MAIVMGRT')
    """
    # Upper and lower case letters are handled in two separate passes.
    upper_case_done = self._data.replace(b"U", b"T")
    dna = upper_case_done.replace(b"u", b"t")
    if not inplace:
        return self.__class__(dna)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = dna
    return self
|
|
|
|
def join(self, other):
    """Return a merge of the sequences in other, spaced by the sequence from self.

    Accepts a Seq object, MutableSeq object, or string (and iterates over
    the letters), or an iterable containing Seq, MutableSeq, or string
    objects. These arguments will be concatenated with the calling sequence
    as the spacer:

    >>> concatenated = Seq('NNNNN').join([Seq("AAA"), Seq("TTT"), Seq("PPP")])
    >>> concatenated
    Seq('AAANNNNNTTTNNNNNPPP')

    Joining the letters of a single sequence:

    >>> Seq('NNNNN').join(Seq("ACGT"))
    Seq('ANNNNNCNNNNNGNNNNNT')
    >>> Seq('NNNNN').join("ACGT")
    Seq('ANNNNNCNNNNNGNNNNNT')

    Any iterable is accepted, including one-shot iterators and generators:

    >>> Seq('NNNNN').join(Seq(s) for s in ("AAA", "TTT"))
    Seq('AAANNNNNTTT')

    A TypeError is raised if other is a SeqRecord, or if the iterable
    contains anything other than Seq objects, MutableSeq objects, or
    strings.
    """
    if isinstance(other, _SeqAbstractBaseClass):
        return self.__class__(str(self).join(str(other)))
    elif isinstance(other, str):
        return self.__class__(str(self).join(other))

    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(other, SeqRecord):
        raise TypeError("Iterable cannot be a SeqRecord")

    # Consume the iterable exactly once: validating in one pass and joining
    # in a second pass would leave nothing to join for an iterator or
    # generator, silently giving an empty result.
    items = []
    for item in other:
        if isinstance(item, SeqRecord):
            raise TypeError("Iterable cannot contain SeqRecords")
        elif not isinstance(item, (str, _SeqAbstractBaseClass)):
            raise TypeError(
                "Input must be an iterable of Seq objects, MutableSeq objects, or strings"
            )
        items.append(item)
    return self.__class__(str(self).join([str(item) for item in items]))
|
|
|
|
def replace(self, old, new, inplace=False):
    """Return a copy with all occurrences of subsequence old replaced by new.

    >>> s = Seq("ACGTAACCGGTT")
    >>> t = s.replace("AC", "XYZ")
    >>> s
    Seq('ACGTAACCGGTT')
    >>> t
    Seq('XYZGTAXYZCGGTT')

    A mutable sequence is modified in place if inplace=True is passed:

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ")
    >>> m
    MutableSeq('ACGTAACCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ", inplace=True)
    >>> m
    MutableSeq('XYZGTAXYZCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``replace`` is called on a ``Seq`` object with ``inplace=True``.
    """
    # Both arguments must be bytes-like for the underlying data.
    if isinstance(old, _SeqAbstractBaseClass):
        old = bytes(old)
    elif isinstance(old, str):
        old = old.encode("ASCII")
    if isinstance(new, _SeqAbstractBaseClass):
        new = bytes(new)
    elif isinstance(new, str):
        new = new.encode("ASCII")
    replaced = self._data.replace(old, new)
    if not inplace:
        return self.__class__(replaced)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = replaced
    return self
|
|
|
|
@property
def defined(self):
    """Return True if the sequence is defined, False if undefined or partially defined.

    A sequence stored as bytes or bytearray is always defined; for any
    other data object the answer is taken from its ``defined`` attribute.

    Zero-length sequences are always considered to be defined.
    """
    data = self._data
    return True if isinstance(data, (bytes, bytearray)) else data.defined
|
|
|
|
@property
def defined_ranges(self):
    """Return a tuple of the ranges where the sequence contents is defined.

    The return value has the format ((start1, end1), (start2, end2), ...).

    A sequence stored as bytes or bytearray gives a single range covering
    the whole sequence, or an empty tuple if its length is zero; for any
    other data object the value of its ``defined_ranges`` attribute is
    returned.
    """
    data = self._data
    if not isinstance(data, (bytes, bytearray)):
        return data.defined_ranges
    size = len(self)
    return ((0, size),) if size > 0 else ()
|
|
|
|
|
|
class Seq(_SeqAbstractBaseClass):
    """Read-only sequence object (essentially a string with biological methods).

    Like normal python strings, our basic sequence object is immutable.
    This prevents you from doing my_seq[5] = "A" for example, but does allow
    Seq objects to be used as dictionary keys.

    The Seq object provides a number of string like methods (such as count,
    find, split and strip).

    The Seq object also provides some biological methods, such as complement,
    reverse_complement, transcribe, back_transcribe and translate (which are
    not applicable to protein sequences).
    """

    # Either the raw letters, or a data object standing in for sequence
    # contents that are unknown or only partially known.
    _data: Union[bytes, SequenceDataAbstractBaseClass]

    def __init__(
        self,
        data: Union[
            str,
            bytes,
            bytearray,
            _SeqAbstractBaseClass,
            SequenceDataAbstractBaseClass,
            dict,
            None,
        ],
        length: Optional[int] = None,
    ):
        """Create a Seq object.

        Arguments:
         - data - Sequence, required.  A string, bytes, bytearray, another
           sequence object, or a sequence data object; None for a sequence
           of unknown contents; or a dictionary mapping start positions to
           the known sequence segments.
         - length - Sequence length, used only if data is None or a dictionary (integer)

        Raises a ValueError if the length is missing or negative when it is
        needed, if dictionary segments overlap, start at a negative position
        or extend beyond the sequence length, or if a dictionary value is
        neither a string nor convertible to bytes.  Raises a TypeError if
        data is of an unsupported type.

        You will typically use Bio.SeqIO to read in sequences from files as
        SeqRecord objects, whose sequence will be exposed as a Seq object via
        the seq property.

        However, you can also create a Seq object directly:

        >>> from Bio.Seq import Seq
        >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF")
        >>> my_seq
        Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
        >>> print(my_seq)
        MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF

        To create a Seq object for a sequence of known length but
        unknown sequence contents, use None for the data argument and pass
        the sequence length for the length argument. Trying to access the
        sequence contents of a Seq object created in this way will raise
        an UndefinedSequenceError:

        >>> my_undefined_sequence = Seq(None, 20)
        >>> my_undefined_sequence
        Seq(None, length=20)
        >>> len(my_undefined_sequence)
        20
        >>> print(my_undefined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is undefined

        If the sequence contents is known for parts of the sequence only, use
        a dictionary for the data argument to pass the known sequence segments:

        >>> my_partially_defined_sequence = Seq({3: "ACGT"}, 10)
        >>> my_partially_defined_sequence
        Seq({3: 'ACGT'}, length=10)
        >>> len(my_partially_defined_sequence)
        10
        >>> print(my_partially_defined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is only partially defined
        >>> my_partially_defined_sequence[3:7]
        Seq('ACGT')
        >>> print(my_partially_defined_sequence[3:7])
        ACGT

        A dictionary without any segments describes a sequence whose
        contents are entirely unknown, and is stored the same way as when
        None is passed for the data argument.
        """
        if data is None:
            if length is None:
                raise ValueError("length must not be None if data is None")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                self._data = _UndefinedSequenceData(length)
        elif isinstance(data, (bytes, SequenceDataAbstractBaseClass)):
            self._data = data
        elif isinstance(data, (bytearray, _SeqAbstractBaseClass)):
            # Take an immutable snapshot of mutable or wrapped data.
            self._data = bytes(data)
        elif isinstance(data, str):
            self._data = bytes(data, encoding="ASCII")
        elif isinstance(data, dict):
            if length is None:
                raise ValueError("length must not be None if data is a dictionary")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                current = 0  # start of the segment currently being built
                end = -1  # end of the previous segment; -1 means none yet
                _data: dict[int, bytes] = {}
                for start in sorted(data):
                    if start < 0:
                        # Without this check a start of -1 would match the
                        # initial value of end and fail with a KeyError.
                        raise ValueError(
                            "Sequence data start positions must not be negative."
                        )
                    seq = data[start]
                    if isinstance(seq, str):
                        seq = bytes(seq, encoding="ASCII")
                    else:
                        try:
                            seq = bytes(seq)
                        except Exception:
                            raise ValueError(
                                "Expected bytes-like objects or strings"
                            ) from None
                    if start < end:
                        raise ValueError("Sequence data are overlapping.")
                    elif start == end:
                        # Directly follows the previous segment: merge them.
                        _data[current] += seq
                    else:
                        _data[start] = seq
                        current = start
                    end = start + len(seq)
                if end > length:
                    raise ValueError(
                        "Provided sequence data extend beyond sequence length."
                    )
                elif not _data:
                    # No segments at all: the contents are fully undefined.
                    # A partially defined data object without segments
                    # cannot be concatenated later on.
                    self._data = _UndefinedSequenceData(length)
                elif end == length and current == 0:
                    # sequence is fully defined
                    self._data = _data[current]
                else:
                    self._data = _PartiallyDefinedSequenceData(length, _data)
        else:
            raise TypeError(
                "data should be a string, bytes, bytearray, Seq, or MutableSeq object"
            )

    def __hash__(self):
        """Hash of the sequence as a string for comparison.

        See Seq object comparison documentation (method ``__eq__`` in
        particular) as this has changed in Biopython 1.65. Older versions
        would hash on object identity.
        """
        return hash(self._data)
|
|
|
|
|
|
class MutableSeq(_SeqAbstractBaseClass):
    """Sequence object whose letters can be changed in place.

    Python strings and the Seq class are immutable; a MutableSeq can be
    edited after it has been created.  The price for this is that a
    MutableSeq object cannot serve as a dictionary key.

    >>> from Bio.Seq import MutableSeq
    >>> my_seq = MutableSeq("ACTCGTCGTCG")
    >>> my_seq
    MutableSeq('ACTCGTCGTCG')
    >>> my_seq[5]
    'T'
    >>> my_seq[5] = "A"
    >>> my_seq
    MutableSeq('ACTCGACGTCG')
    >>> my_seq[5]
    'A'
    >>> my_seq[5:8] = "NNN"
    >>> my_seq
    MutableSeq('ACTCGNNNTCG')
    >>> len(my_seq)
    11

    Note that the MutableSeq object does not support as many string-like
    or biological methods as the Seq object.
    """

    def __init__(self, data):
        """Create a MutableSeq object from a string, bytes, bytearray or sequence.

        A bytearray is used as the internal storage directly (no copy is
        made); every other accepted input is copied into a new bytearray.
        """
        if isinstance(data, bytearray):
            storage = data
        elif isinstance(data, bytes):
            storage = bytearray(data)
        elif isinstance(data, str):
            storage = bytearray(data, "ASCII")
        elif isinstance(data, MutableSeq):
            # Slice to get an independent copy of the other object's letters.
            storage = data._data[:]
        elif isinstance(data, Seq):
            # Go through bytes() rather than touching the Seq internals.
            storage = bytearray(bytes(data))
        else:
            raise TypeError(
                "data should be a string, bytearray object, Seq object, or a "
                "MutableSeq object"
            )
        self._data = storage

    def __setitem__(self, index, value):
        """Assign a single letter (integer index) or a subsequence (slice).

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq[0] = 'T'
        >>> my_seq
        MutableSeq('TCTCGACGTCG')
        """
        if isinstance(index, numbers.Integral):
            # One position: store the code point of the given letter.
            self._data[index] = ord(value)
            return
        # Slice assignment: convert the new subsequence to raw bytes first.
        if isinstance(value, MutableSeq):
            replacement = value._data
        elif isinstance(value, Seq):
            replacement = bytes(value)
        elif isinstance(value, str):
            replacement = value.encode("ASCII")
        else:
            raise TypeError(f"received unexpected type '{type(value).__name__}'")
        self._data[index] = replacement

    def __delitem__(self, index):
        """Delete a single letter (integer index) or a subsequence (slice).

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> del my_seq[0]
        >>> my_seq
        MutableSeq('CTCGACGTCG')
        """
        # The underlying bytearray handles integers and slices alike.
        del self._data[index]

    def append(self, c):
        """Add a single letter to the end of the mutable sequence.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.append('A')
        >>> my_seq
        MutableSeq('ACTCGACGTCGA')

        No return value.
        """
        codepoint = ord(c.encode("ASCII"))
        self._data.append(codepoint)

    def insert(self, i, c):
        """Insert a single letter into the mutable sequence at the given index.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.insert(0,'A')
        >>> my_seq
        MutableSeq('AACTCGACGTCG')
        >>> my_seq.insert(8,'G')
        >>> my_seq
        MutableSeq('AACTCGACGGTCG')

        No return value.
        """
        codepoint = ord(c.encode("ASCII"))
        self._data.insert(i, codepoint)

    def pop(self, i=-1):
        """Remove the letter at the given index and return it.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.pop()
        'G'
        >>> my_seq
        MutableSeq('ACTCGACGTC')
        >>> my_seq.pop()
        'C'
        >>> my_seq
        MutableSeq('ACTCGACGT')

        By default the last letter of the sequence is removed.
        """
        codepoint = self._data[i]
        del self._data[i]
        return chr(codepoint)

    def remove(self, item):
        """Remove the first occurrence of a single letter from the mutable sequence.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.remove('C')
        >>> my_seq
        MutableSeq('ATCGACGTCG')
        >>> my_seq.remove('A')
        >>> my_seq
        MutableSeq('TCGACGTCG')

        No return value.  A ValueError is raised if the letter is absent.
        """
        target = ord(item)
        try:
            self._data.remove(target)
        except ValueError:
            # Re-raise with a message that names this class, not bytearray.
            raise ValueError("value not found in MutableSeq") from None

    def reverse(self):
        """Reverse the order of the letters in place.

        No return value.
        """
        self._data.reverse()

    def extend(self, other):
        """Append all letters of another sequence or string to this one.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.extend('A')
        >>> my_seq
        MutableSeq('ACTCGACGTCGA')
        >>> my_seq.extend('TTT')
        >>> my_seq
        MutableSeq('ACTCGACGTCGATTT')

        No return value.
        """
        if isinstance(other, MutableSeq):
            addition = other._data
        elif isinstance(other, Seq):
            addition = bytes(other)
        elif isinstance(other, str):
            addition = other.encode("ASCII")
        else:
            raise TypeError("expected a string, Seq or MutableSeq")
        self._data.extend(addition)
|
|
|
|
|
|
class UndefinedSequenceError(ValueError):
    """Raised when sequence contents that are not defined are requested."""
|
|
|
|
|
|
class _UndefinedSequenceData(SequenceDataAbstractBaseClass):
    """Length-only stand-in for sequence data with unknown contents (PRIVATE).

    A Seq object can be built on top of an instance of this class to
    represent a sequence of which only the length is known.
    ``__len__`` reports that length.  ``__getitem__`` raises an
    UndefinedSequenceError, except that an empty slice yields an empty
    bytes object and a non-empty slice yields another undefined data
    object of the corresponding size.
    """

    __slots__ = ("_length",)

    def __init__(self, length):
        """Store the sequence length.

        It is up to the caller to make sure that the length is greater
        than zero.
        """
        self._length = length
        super().__init__()

    def __getitem__(self, key: slice) -> Union[bytes, "_UndefinedSequenceData"]:
        # Individual letters are unknown, so only slices can be served.
        if not isinstance(key, slice):
            raise UndefinedSequenceError("Sequence content is undefined")
        size = len(range(*key.indices(self._length)))
        if size:
            return _UndefinedSequenceData(size)
        return b""

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is undefined")

    def __add__(self, other):
        total = len(self) + len(other)
        try:
            known = bytes(other)
        except UndefinedSequenceError:
            if not isinstance(other, _UndefinedSequenceData):
                # _PartiallyDefinedSequenceData.__radd__ will handle this
                return NotImplemented
            return _UndefinedSequenceData(total)
        # Known letters follow the undefined stretch held by this object.
        return _PartiallyDefinedSequenceData(total, {len(self): known})

    def __radd__(self, other):
        # Known letters come first, followed by this undefined stretch.
        known = bytes(other)
        total = len(other) + len(self)
        return _PartiallyDefinedSequenceData(total, {0: known})

    def upper(self):
        """Return an undefined sequence of the same length.

        Changing the case of unknown letters leaves them unknown.
        """
        return _UndefinedSequenceData(self._length)

    def lower(self):
        """Return an undefined sequence of the same length.

        Changing the case of unknown letters leaves them unknown.
        """
        return _UndefinedSequenceData(self._length)

    def isupper(self):
        """Raise an UndefinedSequenceError.

        The case of letters that are not known cannot be tested.
        """
        raise UndefinedSequenceError("Sequence content is undefined")

    def islower(self):
        """Raise an UndefinedSequenceError.

        The case of letters that are not known cannot be tested.
        """
        raise UndefinedSequenceError("Sequence content is undefined")

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new.

        If old and new have the same number of characters, the result is
        an undefined sequence of unchanged length.  Otherwise the length
        of the result cannot be known and an UndefinedSequenceError is
        raised.
        """
        if len(old) == len(new):
            return _UndefinedSequenceData(self._length)
        raise UndefinedSequenceError("Sequence content is undefined")

    @property
    def defined(self):
        """Return False: the contents are unknown and the length is non-zero."""
        return False

    @property
    def defined_ranges(self):
        """Return an empty tuple, as no part of the sequence is defined."""
        return ()
|
|
|
|
|
|
class _PartiallyDefinedSequenceData(SequenceDataAbstractBaseClass):
    """Stores the known segments of a partially defined sequence (PRIVATE).

    Objects of this class can be used to create a Seq object to represent
    sequences with a known length, but with a sequence contents that is only
    partially known.
    Calling __len__ returns the sequence length, calling __getitem__ returns
    the sequence contents if known, otherwise an UndefinedSequenceError is
    raised.
    """

    # _length: total sequence length.
    # _data: dictionary mapping the start position of each known segment
    #     to its letters (bytes); the methods below expect the segments in
    #     order of increasing start position.
    __slots__ = ("_length", "_data")

    def __init__(self, length, data):
        """Initialize with the sequence length and defined sequence segments.

        The calling function is responsible for ensuring that the length is
        greater than zero.
        """
        self._length = length
        self._data = data
        super().__init__()

    def __getitem__(
        self, key: Union[slice, int]
    ) -> Union[bytes, SequenceDataAbstractBaseClass]:
        """Return the letter at an index, or the data selected by a slice.

        A slice returns bytes if the selected region is fully defined, an
        _UndefinedSequenceData object if none of it is defined, and a
        _PartiallyDefinedSequenceData object otherwise.  An integer index
        (negative values count from the end) returns the letter as an
        integer; an IndexError is raised if the index is out of range and
        an UndefinedSequenceError if the letter is unknown.
        """
        if isinstance(key, slice):
            start, end, step = key.indices(self._length)
            size = len(range(start, end, step))
            if size == 0:
                return b""
            data = {}
            for s, d in self._data.items():
                # Positions selected by the slice, relative to this segment.
                indices = range(-s, -s + self._length)[key]
                e: Optional[int] = indices.stop
                assert e is not None
                if step > 0:
                    if e <= 0:
                        continue
                    if indices.start < 0:
                        # First selected position inside the segment.
                        s = indices.start % step
                    else:
                        s = indices.start
                else:  # step < 0
                    if e < 0:
                        e = None
                    end = len(d) - 1
                    if indices.start > end:
                        # First selected position inside the segment.
                        s = end + (indices.start - end) % step
                    else:
                        s = indices.start
                    if s < 0:
                        continue
                # Position of the first selected letter in the result.
                start = (s - indices.start) // step
                d = d[s:e:step]
                if d:
                    data[start] = d
            if len(data) == 0:  # Fully undefined sequence
                return _UndefinedSequenceData(size)
            # Merge adjacent sequence segments.  For a negative step the
            # segments were collected in order of decreasing position, so
            # sort them first; otherwise adjacent segments would never be
            # merged and a fully defined result would not be recognized.
            end = -1
            previous = 0  # reassigned before its first use
            items = sorted(data.items())
            data = {}
            for start, seq in items:
                if end == start:
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
            if len(data) == 1:
                seq = data.get(0)
                if seq is not None and len(seq) == size:
                    return seq  # Fully defined sequence; return bytes
            return _PartiallyDefinedSequenceData(size, data)
        index = key
        if index < 0:
            # Negative indices count from the end of the sequence.
            index += self._length
        if not 0 <= index < self._length:
            raise IndexError("sequence index out of range")
        for start, seq in self._data.items():
            if start <= index < start + len(seq):
                return seq[index - start]
        raise UndefinedSequenceError("Sequence at position %d is undefined" % key)

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def __add__(self, other):
        length = len(self) + len(other)
        data = dict(self._data)
        items = list(self._data.items())
        # Last known segment; it is extended if it reaches the very end.
        start, seq = items[-1]
        end = start + len(seq)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            if isinstance(other, _UndefinedSequenceData):
                # Nothing known is added; only the length grows.
                pass
            elif isinstance(other, _PartiallyDefinedSequenceData):
                other_items = list(other._data.items())
                if end == len(self):
                    other_start, other_seq = other_items.pop(0)
                    if other_start == 0:
                        # The segments touch at the junction: merge them.
                        data[start] += other_seq
                    else:
                        data[len(self) + other_start] = other_seq
                for other_start, other_seq in other_items:
                    data[len(self) + other_start] = other_seq
        else:
            if end == len(self):
                data[start] += other
            else:
                data[len(self)] = other
        return _PartiallyDefinedSequenceData(length, data)

    def __radd__(self, other):
        length = len(other) + len(self)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            # Undefined prefix: shift the known segments to the right.
            data = {len(other) + start: seq for start, seq in self._data.items()}
        else:
            data = {0: other}
            items = list(self._data.items())
            start, seq = items.pop(0)
            if start == 0:
                # The prefix runs straight into the first known segment.
                data[0] += seq
            else:
                data[len(other) + start] = seq
            for start, seq in items:
                data[len(other) + start] = seq
        return _PartiallyDefinedSequenceData(length, data)

    def __mul__(self, other):
        length = self._length
        items = self._data.items()
        data = {}
        end = -1
        previous = 0  # reassigned before its first use
        for i in range(other):
            for start, seq in items:
                start += i * length
                if end == start:
                    # The previous segment ended exactly here: merge.
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
        return _PartiallyDefinedSequenceData(length * other, data)

    def upper(self):
        """Return an upper case copy of the sequence."""
        data = {start: seq.upper() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def lower(self):
        """Return a lower case copy of the sequence."""
        data = {start: seq.lower() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def isupper(self):
        """Raise an UndefinedSequenceError.

        The case of a sequence cannot be tested while parts of it are
        unknown.
        """
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def islower(self):
        """Raise an UndefinedSequenceError.

        The case of a sequence cannot be tested while parts of it are
        unknown.
        """
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

          table
            Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        items = self._data.items()
        data = {start: seq.translate(table, delete) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # Replacing substring old by new in the undefined sequence segments
        # will result in an undefined sequence segment of the same length, if
        # old and new have the same number of characters. If not, an error is
        # raised, as the correct start positions cannot be calculated reliably.
        if len(old) != len(new):
            raise UndefinedSequenceError(
                "Sequence content is only partially defined; substring \n"
                "replacement cannot be performed reliably"
            )
        items = self._data.items()
        data = {start: seq.replace(old, new) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    @property
    def defined(self):
        """Return False, as the sequence is not fully defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        return tuple((start, start + len(seq)) for start, seq in self._data.items())
|
|
|
|
|
|
# The transcribe, back_transcribe, and translate functions are
|
|
# user-friendly versions of the corresponding Seq/MutableSeq methods.
|
|
# The functions work both on Seq objects, and on strings.
|
|
|
|
|
|
def transcribe(dna):
    """Transcribe a DNA sequence into RNA.

    By convention the input is taken to be the coding strand of the DNA
    double helix rather than the template strand, so the RNA sequence is
    obtained simply by replacing T with U.

    A string gives a new string.  A Seq or MutableSeq gives a new Seq
    object.

    e.g.

    >>> transcribe("ACTGN")
    'ACUGN'
    """
    if isinstance(dna, (Seq, MutableSeq)):
        # MutableSeq input is converted so that a Seq is always returned.
        seq = dna if isinstance(dna, Seq) else Seq(dna)
        return seq.transcribe()
    # Anything else is treated as a string; both cases are handled.
    return dna.replace("T", "U").replace("t", "u")
|
|
|
|
|
|
def back_transcribe(rna):
    """Back-transcribe an RNA sequence into DNA.

    A string gives a new string.  A Seq or MutableSeq gives a new Seq
    object.

    e.g.

    >>> back_transcribe("ACUGN")
    'ACTGN'
    """
    if isinstance(rna, (Seq, MutableSeq)):
        # MutableSeq input is converted so that a Seq is always returned.
        seq = rna if isinstance(rna, Seq) else Seq(rna)
        return seq.back_transcribe()
    # Anything else is treated as a string; both cases are handled.
    return rna.replace("U", "T").replace("u", "t")
|
|
|
|
|
|
def _translate_str(
    sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None
):
    """Translate nucleotide string into a protein string (PRIVATE).

    Arguments:
     - sequence - a string
     - table - Which codon table to use?  This can be either a name (string),
       an NCBI identifier (integer), or a CodonTable object (useful for
       non-standard genetic codes).
     - stop_symbol - a single character string, what to use for terminators.
     - to_stop - boolean, should translation terminate at the first
       in frame stop codon?  If there is no in-frame stop codon
       then translation continues to the end.
     - pos_stop - a single character string for a possible stop codon
       (e.g. TAN or NNN)
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None.

    Returns a string.

    e.g.

    >>> from Bio.Data import CodonTable
    >>> table = CodonTable.ambiguous_dna_by_id[1]
    >>> _translate_str("AAA", table)
    'K'
    >>> _translate_str("TAR", table)
    '*'
    >>> _translate_str("TAN", table)
    'X'
    >>> _translate_str("TAN", table, pos_stop="@")
    '@'
    >>> _translate_str("TA?", table)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Codon 'TA?' is invalid

    A trailing partial codon is not translated and triggers a
    BiopythonWarning (with cds=True it is an error instead).

    If **cds=True**, the start and stop codons are checked, and the start
    codon will be translated as methionine. The sequence must be a
    whole number of codons.

    >>> _translate_str("ATGCCCTAG", table, cds=True)
    'MP'
    >>> _translate_str("AAACCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: First codon 'AAA' is not a start codon
    >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Extra in frame stop codon 'TAG' found.
    """
    # Work out which codon table to use: try the argument as an NCBI
    # identifier first, then as a table name, then as a table object.
    try:
        table_id = int(table)
    except ValueError:
        # Not a number, so presumably a table name.  The same table
        # serves both RNA and DNA.
        try:
            codon_table = CodonTable.ambiguous_generic_by_name[table]
        except KeyError:
            if not isinstance(table, str):
                raise TypeError("table argument must be integer or string") from None
            raise ValueError(
                "The Bio.Seq translate methods and function DO NOT "
                "take a character string mapping table like the python "
                "string object's translate method. "
                "Use str(my_seq).translate(...) instead."
            ) from None
    except (AttributeError, TypeError):
        # Cannot be converted to an integer: only a CodonTable will do.
        if not isinstance(table, CodonTable.CodonTable):
            raise ValueError("Bad table argument") from None
        codon_table = table
    else:
        # A numeric identifier.  The same table serves both RNA and DNA.
        codon_table = CodonTable.ambiguous_generic_by_id[table_id]

    sequence = sequence.upper()
    residues = []
    forward_table = codon_table.forward_table
    stop_codons = codon_table.stop_codons
    alphabet = codon_table.nucleotide_alphabet
    if alphabet is None:
        # No alphabet on the table: allow any ambiguous DNA or RNA letter.
        valid_letters = set(
            IUPACData.ambiguous_dna_letters.upper()
            + IUPACData.ambiguous_rna_letters.upper()
        )
    else:
        valid_letters = set(alphabet.upper())
    n = len(sequence)

    # Some tables have stop codons that can also code for an amino acid.
    dual_coding = [c for c in stop_codons if c in forward_table]
    if dual_coding:
        c = dual_coding[0]
        if to_stop:
            raise ValueError(
                "You cannot use 'to_stop=True' with this table as it contains"
                f" {len(dual_coding)} codon(s) which can be both STOP and an"
                f" amino acid (e.g. '{c}' -> '{forward_table[c]}' or STOP)."
            )
        warnings.warn(
            f"This table contains {len(dual_coding)} codon(s) which code(s) for"
            f" both STOP and an amino acid (e.g. '{c}' -> '{forward_table[c]}'"
            " or STOP). Such codons will be translated as amino acid.",
            BiopythonWarning,
        )

    if cds:
        if str(sequence[:3]).upper() not in codon_table.start_codons:
            raise CodonTable.TranslationError(
                f"First codon '{sequence[:3]}' is not a start codon"
            )
        if n % 3 != 0:
            raise CodonTable.TranslationError(
                f"Sequence length {n} is not a multiple of three"
            )
        if str(sequence[-3:]).upper() not in stop_codons:
            raise CodonTable.TranslationError(
                f"Final codon '{sequence[-3:]}' is not a stop codon"
            )
        # The start codon becomes M by hand and the final stop codon is
        # dropped; only the codons in between go through the table.
        sequence = sequence[3:-3]
        n -= 6
        residues.append("M")
    elif n % 3 != 0:
        warnings.warn(
            "Partial codon, len(sequence) not a multiple of three. "
            "Explicitly trim the sequence or add trailing N before "
            "translation. This may become an error in future.",
            BiopythonWarning,
        )

    if gap is not None:
        if not isinstance(gap, str):
            raise TypeError("Gap character should be a single character string.")
        elif len(gap) > 1:
            raise ValueError("Gap character should be a single character string.")

    # Any trailing partial codon is left untranslated.
    whole_codons_end = n - n % 3
    for offset in range(0, whole_codons_end, 3):
        codon = sequence[offset : offset + 3]
        try:
            residue = forward_table[codon]
        except (KeyError, CodonTable.TranslationError):
            if codon in codon_table.stop_codons:
                if cds:
                    raise CodonTable.TranslationError(
                        f"Extra in frame stop codon '{codon}' found."
                    ) from None
                if to_stop:
                    break
                residues.append(stop_symbol)
            elif valid_letters.issuperset(set(codon)):
                # Possible stop codon (e.g. NNN or TAN)
                residues.append(pos_stop)
            elif gap is not None and codon == gap * 3:
                # Gapped translation
                residues.append(gap)
            else:
                raise CodonTable.TranslationError(
                    f"Codon '{codon}' is invalid"
                ) from None
        else:
            residues.append(residue)
    return "".join(residues)
|
|
|
|
|
|
def translate(
    sequence, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap=None
):
    """Translate a nucleotide sequence into amino acids.

    If given a string, returns a new string object. Given a Seq or
    MutableSeq, returns a Seq object.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable object
       (useful for non-standard genetic codes).  Defaults to the "Standard"
       table.
     - stop_symbol - Single character string, what to use for any
       terminators, defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons
       (translated as the specified stop_symbol).  If
       True, translation is terminated at the first in
       frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None.  For a string, None means that gaps are not
       translated.  For a Seq or MutableSeq, None leaves the choice to
       the default of the object's own translate method, while any other
       value is passed on to that method.

    A simple string example using the default (standard) genetic code:

    >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    >>> translate(coding_dna)
    'VAIVMGR*KGAR*'
    >>> translate(coding_dna, stop_symbol="@")
    'VAIVMGR@KGAR@'
    >>> translate(coding_dna, to_stop=True)
    'VAIVMGR'

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> translate(coding_dna, table=2)
    'VAIVMGRWKGAR*'
    >>> translate(coding_dna, table=2, to_stop=True)
    'VAIVMGRWKGAR'

    In fact this example uses an alternative start codon valid under NCBI
    table 2, GTG, which means this example is a complete valid CDS which
    when translated should really start with methionine (not valine):

    >>> translate(coding_dna, table=2, cds=True)
    'MAIVMGRWKGAR'

    Note that if the sequence has no in-frame stop codon, then the to_stop
    argument has no effect:

    >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
    >>> translate(coding_dna2)
    'VAIVMGR'
    >>> translate(coding_dna2, to_stop=True)
    'VAIVMGR'

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    It will however translate either DNA or RNA.

    NOTE - Since version 1.71 Biopython contains codon tables with 'ambiguous
    stop codons'. These are stop codons with unambiguous sequence but which
    have a context dependent coding as STOP or as amino acid. With these tables
    'to_stop' must be False (otherwise a ValueError is raised). The dual
    coding codons will always be translated as amino acid, except for
    'cds=True', where the last codon will be translated as STOP.

    >>> coding_dna3 = "ATGGCACGGAAGTGA"
    >>> translate(coding_dna3)
    'MARK*'

    >>> translate(coding_dna3, table=27)  # Table 27: TGA -> STOP or W
    'MARKW'

    It will however raise a BiopythonWarning (not shown).

    >>> translate(coding_dna3, table=27, cds=True)
    'MARK'

    >>> translate(coding_dna3, table=27, to_stop=True)
    Traceback (most recent call last):
       ...
    ValueError: You cannot use 'to_stop=True' with this table ...
    """
    if isinstance(sequence, Seq):
        seq = sequence
    elif isinstance(sequence, MutableSeq):
        # Convert, so that a Seq object is returned
        seq = Seq(sequence)
    else:
        # Assume it's a string, return a string
        return _translate_str(sequence, table, stop_symbol, to_stop, cds, gap=gap)
    if gap is None:
        # Keep the gap handling that the method applies by default.
        return seq.translate(table, stop_symbol, to_stop, cds)
    # An explicitly requested gap character must not be silently dropped.
    return seq.translate(table, stop_symbol, to_stop, cds, gap=gap)
|
|
|
|
|
|
def reverse_complement(sequence, inplace=False):
    """Return the reverse complement as a DNA sequence.

    The type of the result follows the type of the argument: a string
    gives a new string, a Seq gives a new Seq, a MutableSeq gives a new
    MutableSeq, and a SeqRecord gives a new SeqRecord.

    >>> my_seq = "CGA"
    >>> reverse_complement(my_seq)
    'TCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement(my_seq)
    Seq('TCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> reverse_complement(Seq("CGAUT"))
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    Both lower- and upper-case characters are supported, as are both
    unambiguous and ambiguous nucleotides. All other characters are not
    converted:

    >>> reverse_complement("ACGTUacgtuXYZxyz")
    'zrxZRXaacgtAACGT'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq, inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    Strings and SeqRecord objects cannot be modified in place, so a
    ``TypeError`` is raised if either is passed with ``inplace=True``.
    For ``Seq`` and ``MutableSeq`` objects the ``inplace`` argument is
    handed on to the object's own ``reverse_complement`` method; as
    ``Seq`` objects are immutable, that call is expected to raise a
    ``TypeError`` for ``inplace=True`` as well.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # Sequence objects know how to reverse complement themselves.
        return sequence.reverse_complement(inplace)

    if not isinstance(sequence, SeqRecord):
        # Anything that is not a sequence object or record is assumed to
        # be a plain string.
        if inplace:
            raise TypeError("strings are immutable")
        # The complement table works on bytes, so round-trip via ASCII
        # and reverse the decoded text at the end.
        complemented = sequence.encode("ASCII").translate(_dna_complement_table)
        return complemented.decode("ASCII")[::-1]

    if inplace:
        raise TypeError("SeqRecords are immutable")
    return sequence.reverse_complement()
|
|
|
|
|
|
def reverse_complement_rna(sequence, inplace=False):
    """Return the reverse complement as an RNA sequence.

    The type of the result follows the type of the argument: a string
    gives a new string, a Seq gives a new Seq, a MutableSeq gives a new
    MutableSeq, and a SeqRecord gives a new SeqRecord.

    >>> my_seq = "CGA"
    >>> reverse_complement_rna(my_seq)
    'UCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement_rna(my_seq)
    Seq('UCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> reverse_complement(Seq("CGAUT"), inplace=False)
    Seq('AATCG')

    Both lower- and upper-case characters are supported, as are both
    unambiguous and ambiguous nucleotides. All other characters are not
    converted:

    >>> reverse_complement_rna("ACGTUacgtuXYZxyz")
    'zrxZRXaacguAACGU'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq, inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    Strings and SeqRecord objects cannot be modified in place, so a
    ``TypeError`` is raised if either is passed with ``inplace=True``.
    For ``Seq`` and ``MutableSeq`` objects the ``inplace`` argument is
    handed on to the object's own ``reverse_complement_rna`` method; as
    ``Seq`` objects are immutable, that call is expected to raise a
    ``TypeError`` for ``inplace=True`` as well.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # Sequence objects know how to reverse complement themselves.
        return sequence.reverse_complement_rna(inplace)

    if not isinstance(sequence, SeqRecord):
        # Anything that is not a sequence object or record is assumed to
        # be a plain string.
        if inplace:
            raise TypeError("strings are immutable")
        # The complement table works on bytes, so round-trip via ASCII
        # and reverse the decoded text at the end.
        complemented = sequence.encode("ASCII").translate(_rna_complement_table)
        return complemented.decode("ASCII")[::-1]

    if inplace:
        raise TypeError("SeqRecords are immutable")
    return sequence.reverse_complement_rna()
|
|
|
|
|
|
def complement(sequence, inplace=False):
    """Return the complement as a DNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> complement(my_seq)
    'GCT'
    >>> my_seq = Seq("CGA")
    >>> complement(my_seq)
    Seq('GCT')
    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement("ACGTUacgtuXYZxyz")
    'TGCAAtgcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq, inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    Strings and SeqRecord objects cannot be modified in place, so a
    ``TypeError`` is raised if ``complement`` is called on either of
    them with a true ``inplace`` argument:

    >>> complement("CGA", inplace=True)
    Traceback (most recent call last):
       ...
    TypeError: strings are immutable

    For ``Seq`` and ``MutableSeq`` objects the ``inplace`` argument is
    passed on to the object's own ``complement`` method; as ``Seq``
    objects are immutable, that call is expected to raise a
    ``TypeError`` for ``inplace=True`` as well.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.complement(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement()
    # Assume it's a string.
    # Test truthiness rather than identity with True, so that any true
    # value (e.g. inplace=1) is rejected exactly as in the SeqRecord branch
    # above and in the sibling (reverse_)complement(_rna) functions.
    if inplace:
        raise TypeError("strings are immutable")
    # The complement table works on bytes, so round-trip via ASCII.
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_dna_complement_table)
    return sequence.decode("ASCII")
|
|
|
|
|
|
def complement_rna(sequence, inplace=False):
    """Return the complement as an RNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> complement_rna(my_seq)
    'GCU'
    >>> my_seq = Seq("CGA")
    >>> complement_rna(my_seq)
    Seq('GCU')
    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement_rna("ACGTUacgtuXYZxyz")
    'UGCAAugcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq, inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    Strings and SeqRecord objects cannot be modified in place, so a
    ``TypeError`` is raised if ``complement_rna`` is called on either of
    them with ``inplace=True``. For ``Seq`` and ``MutableSeq`` objects
    the ``inplace`` argument is passed on to the object's own
    ``complement_rna`` method; as ``Seq`` objects are immutable, that
    call is expected to raise a ``TypeError`` for ``inplace=True`` as
    well.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.complement_rna(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement_rna()
    # Assume it's a string.
    if inplace:
        raise TypeError("strings are immutable")
    # The complement table works on bytes, so round-trip via ASCII.
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_rna_complement_table)
    return sequence.decode("ASCII")
|
|
|
|
|
|
def _test():
    """Run the doctests found in this module's docstrings (PRIVATE).

    A progress message is printed before and after the run. The doctests
    are run with ``doctest.IGNORE_EXCEPTION_DETAIL``, so examples that
    expect a traceback do not have to reproduce the full exception
    message.
    """
    print("Running doctests...")
    import doctest

    flags = doctest.IGNORE_EXCEPTION_DETAIL
    doctest.testmod(optionflags=flags)
    print("Done")
|
|
|
|
|
|
# Run the module's doctests when this file is executed directly as a script.
if __name__ == "__main__":
    _test()
|