mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
488 lines
16 KiB
Python
488 lines
16 KiB
Python
#!/usr/bin/env python
|
|
#
|
|
# Restriction Analysis Libraries.
|
|
# Copyright (C) 2004. Frederic Sohm.
|
|
#
|
|
# This code is part of the Biopython distribution and governed by its
|
|
# license. Please see the LICENSE file that should have been included
|
|
# as part of this package.
|
|
#
|
|
r"""Print the results of restriction enzyme analysis.
|
|
|
|
PrintFormat prints the results from restriction analysis in 3 different
|
|
formats: list, number or map.
|
|
|
|
The easiest way to use it is:
|
|
|
|
>>> from Bio.Restriction.PrintFormat import PrintFormat
|
|
>>> from Bio.Restriction.Restriction import RestrictionBatch
|
|
>>> from Bio.Seq import Seq
|
|
>>> pBs_mcs = Seq('GGTACCGGGCCCCCCCTCGAGGTCGACGGTATCGATAAGCTTGATATCGAATTC')
|
|
>>> restriction_batch = RestrictionBatch(['EcoRI', 'BamHI', 'ApaI'])
|
|
>>> result = restriction_batch.search(pBs_mcs)
|
|
>>> my_map = PrintFormat()
|
|
>>> my_map.print_that(result, 'My pBluescript mcs analysis:\n',
|
|
... 'No site:\n')
|
|
My pBluescript mcs analysis:
|
|
ApaI : 12.
|
|
EcoRI : 50.
|
|
No site:
|
|
BamHI
|
|
<BLANKLINE>
|
|
>>> my_map.sequence = pBs_mcs
|
|
>>> my_map.print_as("map")
|
|
>>> my_map.print_that(result)
|
|
12 ApaI
|
|
|
|
|
| 50 EcoRI
|
|
| |
|
|
GGTACCGGGCCCCCCCTCGAGGTCGACGGTATCGATAAGCTTGATATCGAATTC
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
CCATGGCCCGGGGGGGAGCTCCAGCTGCCATAGCTATTCGAACTATAGCTTAAG
|
|
1 54
|
|
<BLANKLINE>
|
|
<BLANKLINE>
|
|
Enzymes which do not cut the sequence.
|
|
<BLANKLINE>
|
|
BamHI
|
|
<BLANKLINE>
|
|
>>>
|
|
|
|
Some of the methods of PrintFormat are meant to be overridden by derived
|
|
classes.
|
|
|
|
Use the following parameters to control the appearance:
|
|
|
|
- ConsoleWidth : width of the console used default to 80.
|
|
should never be less than 60.
|
|
- NameWidth : space attributed to the name in PrintList method.
|
|
- Indent : Indent of the second line.
|
|
- MaxSize : Maximal size of the sequence (default=6):
|
|
-> 99 999 bp + 1 trailing ','
|
|
people are unlikely to ask for restriction map of sequences
|
|
bigger than 100.000 bp. This is needed to determine the
|
|
space to be reserved for sites location.
|
|
|
|
- MaxSize = 5 => 9.999 bp
|
|
- MaxSize = 6 => 99.999 bp
|
|
- MaxSize = 7 => 999.999 bp
|
|
|
|
Example output::
|
|
|
|
<------------ ConsoleWidth --------------->
|
|
<- NameWidth ->
|
|
EcoRI : 1, 45, 50, 300, 400, 650,
|
|
700, 1200, 2500.
|
|
<-->
|
|
Indent
|
|
|
|
""" # noqa: W291
|
|
|
|
|
|
import re
|
|
|
|
|
|
class PrintFormat:
    """Format the results of a restriction analysis as text.

    The output style is selected with print_as() ("list", "number" or "map")
    and produced by format_output() / print_that().  The class attributes
    below control the layout and may be overridden in subclasses.

    The "map" style reads ``self.sequence``, and format_output() falls back
    to ``self.results`` when it is given an empty dictionary; neither
    attribute is defined by this class, so the caller (or a subclass) must
    set them before they are needed.
    """

    # Total width, in characters, assumed for the console.
    ConsoleWidth = 80
    # Width of the column holding the enzyme name.
    NameWidth = 10
    # Number of characters reserved for one site position (digits + comma).
    MaxSize = 6
    # Part of the console width that does not fit a whole name column.
    Cmodulo = ConsoleWidth % NameWidth
    # Usable width: the console width rounded down to whole name columns.
    PrefWidth = ConsoleWidth - Cmodulo
    # Extra indentation of continuation lines in list output.
    Indent = 4
    # Width left for the site positions once the name column is removed.
    linesize = PrefWidth - NameWidth

    def print_as(self, what="list"):
        """Select the output style used by format_output().

        Valid formats are:
         - 'list'   -> alphabetical order
         - 'number' -> grouped by number of sites in the sequence
         - 'map'    -> a map representation of the sequence with the sites.

        Any other value selects the 'list' style.

        If you want more flexibility, override the virtual method
        make_format.
        """
        # The choice is stored on the instance, shadowing the class-level
        # make_format method.
        if what == "map":
            self.make_format = self._make_map
        elif what == "number":
            self.make_format = self._make_number
        else:
            self.make_format = self._make_list

    def format_output(self, dct, title="", s1=""):
        """Summarise results as a nicely formatted string.

        Arguments:
         - dct is a dictionary as returned by a RestrictionBatch.search().
           When it is empty (or None) ``self.results`` is used instead.
         - title is the title of the map.
           It must be a formatted string, i.e. you must include the line
           break.
         - s1 is the title separating the list of enzymes that have sites
           from those without sites.
         - s1 must be a formatted string as well.

        The style of the result is the one selected with print_as()
        (a list by default).
        """
        if not dct:
            dct = self.results
        cutters, non_cutters = [], []
        for enzyme, sites in dct.items():
            if sites:
                cutters.append((enzyme, sites))
            else:
                non_cutters.append(enzyme)
        return self.make_format(cutters, title, non_cutters, s1)

    def print_that(self, dct, title="", s1=""):
        """Print the output of the format_output method (OBSOLETE).

        Arguments:
         - dct is a dictionary as returned by a RestrictionBatch.search()
         - title is the title of the map.
           It must be a formatted string, i.e. you must include the line
           break.
         - s1 is the title separating the list of enzymes that have sites
           from those without sites.
         - s1 must be a formatted string as well.

        This method prints the output of format_output() and it is here
        for backwards compatibility.
        """
        print(self.format_output(dct, title, s1))

    def make_format(self, cut=(), title="", nc=(), s1=""):
        """Virtual method used for formatting results.

        Here to be pointed to one of the _make_* methods (print_as() does
        this on the instance).  You can as well create a new method and
        point make_format to it.  The default is the list style.
        """
        return self._make_list(cut, title, nc, s1)

    # _make_* methods to be used with the virtual method make_format

    def _make_list(self, ls, title, nc, s1):
        """Summarise a list of positions by enzyme (PRIVATE).

        Return a string of form::

            title.

            enzyme1     :   position1, position2.
            enzyme2     :   position1, position2, position3.

        Arguments:
         - ls is a tuple or list of cutting enzymes.
         - title is the title.
         - nc is a tuple or list of non cutting enzymes.
         - s1 is the sentence before the non cutting enzymes.
        """
        return self._make_list_only(ls, title) + self._make_nocut_only(nc, s1)

    def _make_map(self, ls, title, nc, s1):
        """Summarise mapping information as a string (PRIVATE).

        Return a string of form::

            | title.
            |
            |     enzyme1, position
            |     |
            | AAAAAAAAAAAAAAAAAAAAA...
            | |||||||||||||||||||||
            | TTTTTTTTTTTTTTTTTTTTT...

        Arguments:
         - ls is a list of cutting enzymes.
         - title is the title.
         - nc is a list of non cutting enzymes.
         - s1 is the sentence before the non cutting enzymes.
        """
        return self._make_map_only(ls, title) + self._make_nocut_only(nc, s1)

    def _make_number(self, ls, title, nc, s1):
        """Format cutting position information as a string (PRIVATE).

        Returns a string in the form::

            title.

            enzyme which cut 1 time:

            enzyme1     :   position1.

            enzyme which cut 2 times:

            enzyme2     :   position1, position2.
            ...

        Arguments:
         - ls is a list of cutting enzymes.
         - title is the title.
         - nc is a list of non cutting enzymes.
         - s1 is the sentence before the non cutting enzymes.
        """
        return self._make_number_only(ls, title) + self._make_nocut_only(nc, s1)

    def _make_nocut(self, ls, title, nc, s1):
        """Summarise non-cutting enzymes (PRIVATE).

        Return a formatted string of the non cutting enzymes.

        ls is a list of cutting enzymes -> will not be used.
        Here for compatibility with make_format.

        Arguments:
         - title is the title.
         - nc is a list of non cutting enzymes.
         - s1 is the sentence before the non cutting enzymes.
        """
        return title + self._make_nocut_only(nc, s1)

    def _make_nocut_only(self, nc, s1, ls=(), title=""):
        """Summarise non-cutting enzymes (PRIVATE).

        Return a formatted string of the non cutting enzymes: the names are
        sorted and laid out in columns of NameWidth characters.

        Arguments:
         - nc is a tuple or list of non cutting enzymes.
         - s1 is the sentence before the non cutting enzymes.  When it is
           empty a default heading is used.
         - ls and title are unused; they are accepted so that the signature
           stays compatible with the other _make_* helpers.
        """
        if not nc:
            return s1
        parts = [s1 or "\n   Enzymes which do not cut the sequence.\n\n"]
        row = ""
        for key in sorted(nc):
            row += str(key).ljust(self.NameWidth)
            # Start a new row once the current one exceeds the usable width.
            if len(row) > self.linesize:
                parts.append(row + "\n")
                row = ""
        # The last (possibly empty) row is always flushed with a line break.
        parts.append(row + "\n")
        return "".join(parts)

    def _make_list_only(self, ls, title, nc=(), s1=""):
        """Summarise list of positions per enzyme (PRIVATE).

        Return a string of form::

            title.

            enzyme1     :   position1, position2.
            enzyme2     :   position1, position2, position3.
            ...

        Arguments:
         - ls is a tuple or list of results.
         - title is a string.
         - Non cutting enzymes are not included (nc and s1 are unused).
        """
        if not ls:
            return title
        return self.__next_section(ls, title)

    def _make_number_only(self, ls, title, nc=(), s1=""):
        """Summarise number of cuts as a string (PRIVATE).

        Return a string of form::

            title.

            enzyme which cut 1 time:

            enzyme1     :   position1.

            enzyme which cut 2 times:

            enzyme2     :   position1, position2.
            ...

        Arguments:
         - ls is a tuple or list of results; it is not modified.
         - title is a string.
         - Non cutting enzymes are not included (nc and s1 are unused).

        One section is written for every number of cuts that actually
        occurs, in increasing order; no heading is emitted for a count that
        has no enzyme.
        """
        if not ls:
            return title
        # Group by number of sites without sorting the caller's list.
        sections = {}
        for name, sites in ls:
            sections.setdefault(len(sites), []).append((name, sites))
        for count in sorted(sections):
            title += "\n\nenzymes which cut %i times :\n\n" % count
            title = self.__next_section(sections[count], title)
        return title

    @staticmethod
    def _site_marker_lines(keys, offset, enzymemap, last=None):
        """Draw the marker lines placed above one row of the map (PRIVATE).

        Arguments:
         - keys is the sorted list of site positions shown on this row.
         - offset is the sequence position preceding the first column of the
           row (0 for the first row, 60 for the second, ...).
         - enzymemap maps a position to the list of enzyme names cutting
           there.
         - last is the position of the final column of the row; a site found
           there ends the row and its marker line is not padded.

        For every site two lines are produced: one with the position and the
        enzyme names, starting in the column of the site, and one with a
        ``|`` in the column of every site drawn so far on this row.
        """
        ruler = " " * 60
        pieces = []
        for key in keys:
            names = "".join(" " + name for name in enzymemap[key])
            # 1-based column of the site within this row.
            column = key - offset
            head = ruler[0 : (column - 1)]
            pieces.append(head + str(key) + names + "\n")
            if key == last:
                pieces.append(head + "|" + "\n")
                break
            ruler = head + "|" + ruler[column:]
            pieces.append(ruler + "\n")
        return "".join(pieces)

    def _make_map_only(self, ls, title, nc=(), s1=""):
        """Make string describing cutting map (PRIVATE).

        Return a string of form::

            | title.
            |
            |     enzyme1, position
            |     |
            | AAAAAAAAAAAAAAAAAAAAA...
            | |||||||||||||||||||||
            | TTTTTTTTTTTTTTTTTTTTT...

        Arguments:
         - ls is a list of results.
         - title is a string.
         - Non cutting enzymes are not included (nc and s1 are unused).

        The sequence is read from ``self.sequence``, which must support
        len(), str() and ``complement(inplace=False)``.  It is written in
        rows of 60 bases; above each row the sites located on it are marked
        in the column of the site, including a site on the very last base of
        a sequence longer than one row.
        """
        if not ls:
            return title
        # position -> names of the enzymes cutting at that position
        enzymemap = {}
        for enzyme, cut in ls:
            for position in cut:
                enzymemap.setdefault(position, []).append(str(enzyme))
        pending = sorted(enzymemap)
        length = len(self.sequence)
        sequence = str(self.sequence)
        revsequence = str(
            self.sequence.complement(inplace=False)
        )  # TODO: remove inplace=False
        pieces = [title or ""]
        base = 0
        # Full rows of 60 bases; the remainder is handled after the loop.
        for end in range(60, length, 60):
            start = end - 60
            row_keys = [key for key in pending if key <= end]
            # pending is sorted, so the keys of this row form its prefix.
            pending = pending[len(row_keys) :]
            pieces.append(
                self._site_marker_lines(row_keys, start, enzymemap, last=end)
            )
            pieces.append(
                "\n".join(
                    (
                        sequence[start:end],
                        "|" * 60,
                        revsequence[start:end],
                        str(start + 1).ljust(15)
                        + " " * 30
                        + str(end).rjust(15)
                        + "\n\n",
                    )
                )
            )
            base = end
        # Final row: whatever is left after the last multiple of 60.
        pieces.append(
            self._site_marker_lines(pending, base, enzymemap, last=length)
        )
        tail = length - base
        pieces.append(sequence[base:length] + "\n")
        pieces.append("|" * tail + "\n")
        pieces.append(revsequence[base:length] + "\n")
        pieces.append(
            str(base + 1).ljust(15)
            + " " * (tail - 30)
            + str(length).rjust(15)
            + "\n\n"
        )
        return "".join(pieces)

    # private method to do lists:

    def __next_section(self, ls, into):
        """Next section (PRIVATE).

        Arguments:
         - ls is a tuple/list of tuple (string, [int, int]).
         - into is a string to which the formatted ls will be added.

        Format ls as a string of lines, sorted by enzyme.
        The form is::

            enzyme1     :   position1.
            enzyme2     :   position2, position3.

        A list of positions too long for one line is broken after a comma
        and continued on lines indented by NameWidth + Indent spaces.

        The formatted ls is appended to into and the result is returned.
        """
        indentation = "\n" + (self.NameWidth + self.Indent) * " "
        linesize = self.linesize - self.MaxSize
        # Longest run of at most linesize characters ending in "," or ".".
        pat = re.compile(r"([\w,\s()]){1,%i}[,\.]" % linesize)
        parts = [into]
        for name, sites in sorted(ls):
            output = ", ".join(str(site) for site in sites) + "."
            if len(output) > linesize:
                # cut where appropriate and add the indentation
                output = indentation.join(
                    match.group() for match in pat.finditer(output)
                )
            parts.extend((str(name).ljust(self.NameWidth), " : ", output, "\n"))
        return "".join(parts)
|