mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
Align: add substitution matrix support to Alignment.format() (#5049)
* Align: add optional substitution matrix/aligner argument to format() - Extended Alignment.format() to accept an optional argument: * substitution matrix (e.g. from Bio.Align.substitution_matrices) * PairwiseAligner object (uses its substitution matrix) - Preserves compatibility with existing writer-specific arguments (e.g. "vulgar" for exonerate, metadata for mauve). - Internal logic now distinguishes between substitution matrices (used in pretty-printing) and writer arguments (passed to AlignmentWriter). - All alignment format tests pass (58/58). * Align: add substitution matrix argument to helper format functions - Updated _format_pretty, _format_unicode, _format_generalized to accept optional substitution matrix argument. - Passed matrix from Alignment.format() into these helpers. - Updated docstrings accordingly. - All alignment format tests pass (58/58). * Align.format: support substitution matrices in helper functions; add tests - Updated helper functions in Align.format (generalized, pretty, html, etc.) to correctly handle substitution_matrix arguments. - Added new test file `Tests/test_format_matrix.py` to cover: * identity matches (|), positive mismatches (:), negative mismatches (.), and gaps (-) using substitution matrices. * verified behavior with both NUC.4.4 and a custom BLASTN-like matrix. - Confirmed tests pass with pytest. Resolves #5043 * Add Rachel Stern to contributors list and NEWS (first contribution) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix style issues in test_format_matrix and Align -__init__ * Rename test_format_matrix.py to test_format_matrix_unittest.py for AppVeyor test discovery * Move format matrix test to unittest style * Refactor scoring argument handling in counts() and format() - Renamed the optional argument in counts() from `argument` to `scoring` for clarity and consistency. 
- Updated format() to accept the `scoring` parameter via **kwargs instead of a dedicated argument. - Removed the explicit writer argument from format(), since it is no longer needed with kwargs-based handling. - Adjusted the docstring of format() accordingly to reflect the new usage. * Update test_format_matrix_unittest.py: - Use explicit assertEqual for pattern strings instead of only assertIn/NotIn, following reviewer feedback. - Simplify alignment selection by indexing [0] instead of next(iter(...)). * Remove _pattern_from_pretty, assert full alignment format instead - Replaced usage of `_pattern_from_pretty` helper with direct comparison of `aln.format("", scoring=M)` output in all relevant tests. - This makes the tests simpler and closer to the actual API usage. - Removed unused `_pattern_from_pretty` function. * docs: document new `scoring` argument in Alignment.format tutorial - Added explanation of the optional `scoring` argument to `Alignment.format()` in Doc/Tutorial/chapter_align.rst. - Included example using PairwiseAligner and a substitution matrix (NUC.4.4) to illustrate how the pattern line reflects scores. - Clarifies how symbols (|, :, ., -) correspond to matches, positive/negative mismatches, and gaps. * docs: add cont-doctest and update scoring example with '.' mismatch * docs: fix scoring example with gap penalties to show ':' and '.' in doctest * docs: add <BLANKLINE> to doctest output in scoring example * doc: add 8 in the middle line of the format * doc: Arranging the print format * Rearrange the print format --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: mdehoon <mjldehoon@yahoo.com>
This commit is contained in:
@ -2184,6 +2184,16 @@ class Alignment:
|
||||
create a human-readable representation of the alignment,
|
||||
or any of the alignment file formats supported by
|
||||
`Bio.Align` (some have not yet been implemented).
|
||||
- scoring - Optional keyword-only parameter; default=None.
|
||||
If provided, can be:
|
||||
|
||||
- A substitution matrix (typically from the
|
||||
`Bio.Align.substitution_matrices` submodule)
|
||||
used to mark positive matches (:) in the alignment string
|
||||
when two different residues have a positive score.
|
||||
|
||||
- A PairwiseAligner object, in which case its substitution
|
||||
matrix and settings are used for determining positive matches.
|
||||
|
||||
All other arguments are passed to the format-specific writer functions:
|
||||
- mask - PSL format only. Specify if repeat regions in the target
|
||||
@ -2202,8 +2212,23 @@ class Alignment:
|
||||
the alignment and include it in the output. If False
|
||||
(default), do not include the MD tag in the output.
|
||||
"""
|
||||
scoring = kwargs.pop("scoring", None)
|
||||
substitution_matrix = None
|
||||
if scoring is None and args:
|
||||
first = args[0]
|
||||
if isinstance(first, PairwiseAligner):
|
||||
substitution_matrix = first.substitution_matrix
|
||||
args = args[1:]
|
||||
elif isinstance(first, (np.ndarray, substitution_matrices.Array)):
|
||||
substitution_matrix = first
|
||||
args = args[1:]
|
||||
if substitution_matrix is None and scoring is not None:
|
||||
if isinstance(scoring, PairwiseAligner):
|
||||
substitution_matrix = scoring.substitution_matrix
|
||||
elif isinstance(scoring, (np.ndarray, substitution_matrices.Array)):
|
||||
substitution_matrix = scoring
|
||||
if fmt == "":
|
||||
return self._format_pretty()
|
||||
return self._format_pretty(substitution_matrix)
|
||||
module = _load(fmt)
|
||||
if module.AlignmentIterator.mode == "b":
|
||||
raise ValueError(f"{fmt} is a binary file format")
|
||||
@ -2215,10 +2240,17 @@ class Alignment:
|
||||
) from None
|
||||
return writer.format_alignment(self)
|
||||
|
||||
def _format_pretty(self):
|
||||
def _format_pretty(self, matrix=None):
|
||||
"""Return default string representation (PRIVATE).
|
||||
|
||||
Helper for self.format().
|
||||
|
||||
Arguments:
|
||||
- matrix - Optional; default=None
|
||||
A substitution matrix (typically from the
|
||||
`Bio.Align.substitution_matrices` submodule)
|
||||
used to mark positive matches (:) in the alignment string
|
||||
when two different residues have a positive score.
|
||||
"""
|
||||
n = len(self.sequences)
|
||||
if n == 2:
|
||||
@ -2271,7 +2303,7 @@ class Alignment:
|
||||
row[:] = end - positions
|
||||
if isinstance(seq, str):
|
||||
if not seq.isascii():
|
||||
return self._format_unicode()
|
||||
return self._format_unicode(matrix)
|
||||
elif isinstance(seq, (Seq, MutableSeq)):
|
||||
try:
|
||||
seq = bytes(seq)
|
||||
@ -2282,7 +2314,7 @@ class Alignment:
|
||||
seq = s
|
||||
seq = seq.decode()
|
||||
else:
|
||||
return self._format_generalized()
|
||||
return self._format_generalized(matrix)
|
||||
seqs.append(seq)
|
||||
minstep = steps.min(0)
|
||||
maxstep = steps.max(0)
|
||||
@ -2381,6 +2413,10 @@ class Alignment:
|
||||
c = "-"
|
||||
else:
|
||||
c = "."
|
||||
if matrix is not None and c1 != " " and c2 != " ":
|
||||
c1u, c2u = c1.upper(), c2.upper()
|
||||
if matrix[c1u, c2u] > 0:
|
||||
c = ":"
|
||||
pattern += c
|
||||
pattern_line = " %9d %s" % (position, pattern)
|
||||
pattern_lines.append(pattern_line)
|
||||
@ -2426,10 +2462,17 @@ class Alignment:
|
||||
blocks.append(block)
|
||||
return "\n".join(blocks)
|
||||
|
||||
def _format_unicode(self):
|
||||
def _format_unicode(self, matrix=None):
|
||||
"""Return default string representation (PRIVATE).
|
||||
|
||||
Helper for self.format().
|
||||
|
||||
Arguments:
|
||||
- matrix - Optional; default=None
|
||||
A substitution matrix (typically from the
|
||||
`Bio.Align.substitution_matrices` submodule)
|
||||
used to mark positive matches (:) in the alignment string
|
||||
when two different residues have a positive score.
|
||||
"""
|
||||
seqs = []
|
||||
names = []
|
||||
@ -2437,7 +2480,7 @@ class Alignment:
|
||||
for seq, row in zip(self.sequences, coordinates):
|
||||
seq = self._convert_sequence_string(seq)
|
||||
if seq is None:
|
||||
return self._format_generalized()
|
||||
return self._format_generalized(matrix)
|
||||
if row[0] > row[-1]: # mapped to reverse strand
|
||||
row[:] = len(seq) - row[:]
|
||||
seq = reverse_complement(seq)
|
||||
@ -2479,13 +2522,24 @@ class Alignment:
|
||||
c = "-"
|
||||
else:
|
||||
c = "."
|
||||
if matrix is not None and c1 != " " and c2 != " ":
|
||||
c1u, c2u = c1.upper(), c2.upper()
|
||||
if matrix[c1u, c2u] > 0:
|
||||
c = ":"
|
||||
pattern += c
|
||||
return f"{aligned_seq1}\n{pattern}\n{aligned_seq2}\n"
|
||||
|
||||
def _format_generalized(self):
|
||||
def _format_generalized(self, matrix=None):
|
||||
"""Return generalized string representation (PRIVATE).
|
||||
|
||||
Helper for self._format_pretty().
|
||||
|
||||
Arguments:
|
||||
- matrix - Optional; default=None
|
||||
A substitution matrix (typically from the
|
||||
`Bio.Align.substitution_matrices` submodule)
|
||||
used to mark positive matches (:) in the alignment string
|
||||
when two different residues have a positive score.
|
||||
"""
|
||||
seq1, seq2 = self.sequences
|
||||
aligned_seq1 = []
|
||||
@ -2540,6 +2594,10 @@ class Alignment:
|
||||
p = "|"
|
||||
else:
|
||||
p = "."
|
||||
if matrix is not None:
|
||||
c1u, c2u = c1.upper(), c2.upper()
|
||||
if matrix[c1u, c2u] > 0:
|
||||
p = ":"
|
||||
if m1 < m2:
|
||||
space = (m2 - m1) * " "
|
||||
s1 += space
|
||||
@ -3544,10 +3602,10 @@ class Alignment:
|
||||
start1, start2 = end1, end2
|
||||
return m
|
||||
|
||||
def counts(self, argument=None):
|
||||
def counts(self, scoring=None):
|
||||
"""Count the number of identities, mismatches, and gaps of an alignment.
|
||||
|
||||
This method takes a single optional argument, which can be either None
|
||||
This method takes a single optional argument named scoring, which can be either None
|
||||
(default), a substitution matrix, a wildcard character, or a pairwise
|
||||
aligner object:
|
||||
|
||||
@ -3631,7 +3689,7 @@ class Alignment:
|
||||
side of the alignment;
|
||||
- open_right_deletions - the number of deletion gaps opened on the right
|
||||
side of the alignment;
|
||||
- open_internal_insertions - the number of insertion gaps opaned in the
|
||||
- open_internal_insertions - the number of insertion gaps opened in the
|
||||
interior of the alignment;
|
||||
- open_internal_deletions - the number of deletion gaps opened in the
|
||||
interior of the alignment;
|
||||
@ -3671,15 +3729,15 @@ class Alignment:
|
||||
aligner = None
|
||||
wildcard = None
|
||||
substitution_matrix = None
|
||||
if isinstance(argument, PairwiseAligner):
|
||||
aligner = argument
|
||||
if isinstance(scoring, PairwiseAligner):
|
||||
aligner = scoring
|
||||
substitution_matrix = aligner.substitution_matrix
|
||||
elif isinstance(argument, str):
|
||||
wildcard = argument
|
||||
elif isinstance(argument, (np.ndarray, substitution_matrices.Array)):
|
||||
substitution_matrix = argument
|
||||
elif argument is not None:
|
||||
raise ValueError(f"unexpected argument {argument!r}")
|
||||
elif isinstance(scoring, str):
|
||||
wildcard = scoring
|
||||
elif isinstance(scoring, (np.ndarray, substitution_matrices.Array)):
|
||||
substitution_matrix = scoring
|
||||
elif scoring is not None:
|
||||
raise ValueError(f"unexpected argument {scoring!r}")
|
||||
if substitution_matrix is None:
|
||||
alphabet = []
|
||||
codec = "utf-32-le" if sys.byteorder == "little" else "utf-32-be"
|
||||
|
@ -287,6 +287,7 @@ please open an issue on GitHub or mention it on the mailing list.
|
||||
- Philip Capel <https://github.com/pcapel>
|
||||
- Phillip Garland <pgarland at gmail>
|
||||
- Pol Estecha <https://github.com/poleshe>
|
||||
- Rachel Stern <https://github.com/RachelStern20>
|
||||
- Ralf Stephan <https://github.com/rwst>
|
||||
- Rasmus Fonseca <https://github.com/RasmusFonseca>
|
||||
- rht <https://github.com/rht>
|
||||
|
@ -2427,8 +2427,8 @@ format used by PFAM:
|
||||
#=GF RN [1]
|
||||
#=GF RM 3130377
|
||||
#=GF RT Microsequence analysis of DNA-binding proteins 7a, 7b, and 7e
|
||||
#=GF RT from the archaebacterium Sulfolobus acidocaldarius.
|
||||
#=GF RA Choli T, Wittmann-Liebold B, Reinhardt R;
|
||||
#=GF RT from the archaebacterium Sulfolobus acidocaldarius.
|
||||
#=GF RA Choli T, Wittmann-Liebold B, Reinhardt R;
|
||||
#=GF RL J Biol Chem 1988;263:7087-7093.
|
||||
#=GF DR INTERPRO; IPR003212;
|
||||
#=GF DR SCOP; 1sso; fa;
|
||||
@ -2640,26 +2640,26 @@ source distribution):
|
||||
.. code:: text
|
||||
|
||||
3 384
|
||||
CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- --------SQ
|
||||
FLEFQDKFNK KY-SHEEYLE RFEIFKSNLG KIEELNLIAI NHKADTKFGV NKFADLSSDE
|
||||
FKNYYLNNKE AIFTDDLPVA DYLDDEFINS IPTAFDWRTR G-AVTPVKNQ GQCGSCWSFS
|
||||
TTGNVEGQHF ISQNKLVSLS EQNLVDCDHE CMEYEGEEAC DEGCNGGLQP NAYNYIIKNG
|
||||
GIQTESSYPY TAETGTQCNF NSANIGAKIS NFTMIP-KNE TVMAGYIVST GPLAIAADAV
|
||||
E-WQFYIGGV F-DIPCN--P NSLDHGILIV GYSAKNTIFR KNMPYWIVKN SWGADWGEQG
|
||||
YIYLRRGKNT CGVSNFVSTS II--
|
||||
ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG ALGRTRHALR
|
||||
FARFAVRYGK SYESAAEVRR RFRIFSESLE EVRSTN---- RKGLPYRLGI NRFSDMSWEE
|
||||
FQATRL-GAA QTCSATLAGN HLMRDA--AA LPETKDWRED G-IVSPVKNQ AHCGSCWTFS
|
||||
TTGALEAAYT QATGKNISLS EQQLVDCAGG FNNF------ --GCNGGLPS QAFEYIKYNG
|
||||
GIDTEESYPY KGVNGV-CHY KAENAAVQVL DSVNITLNAE DELKNAVGLV RPVSVAFQVI
|
||||
DGFRQYKSGV YTSDHCGTTP DDVNHAVLAV GYGVENGV-- ---PYWLIKN SWGADWGDNG
|
||||
YFKMEMGKNM CAIATCASYP VVAA
|
||||
CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- --------FH
|
||||
FKSWMSKHRK TY-STEEYHH RLQTFASNWR KINAHN---- NGNHTFKMAL NQFSDMSFAE
|
||||
IKHKYLWSEP QNCSAT--KS NYLRGT--GP YPPSVDWRKK GNFVSPVKNQ GACGSCWTFS
|
||||
TTGALESAIA IATGKMLSLA EQQLVDCAQD FNNY------ --GCQGGLPS QAFEYILYNK
|
||||
GIMGEDTYPY QGKDGY-CKF QPGKAIGFVK DVANITIYDE EAMVEAVALY NPVSFAFEVT
|
||||
QDFMMYRTGI YSSTSCHKTP DKVNHAVLAV GYGEKNGI-- ---PYWIVKN SWGPQWGMNG
|
||||
CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- --------SQ
|
||||
FLEFQDKFNK KY-SHEEYLE RFEIFKSNLG KIEELNLIAI NHKADTKFGV NKFADLSSDE
|
||||
FKNYYLNNKE AIFTDDLPVA DYLDDEFINS IPTAFDWRTR G-AVTPVKNQ GQCGSCWSFS
|
||||
TTGNVEGQHF ISQNKLVSLS EQNLVDCDHE CMEYEGEEAC DEGCNGGLQP NAYNYIIKNG
|
||||
GIQTESSYPY TAETGTQCNF NSANIGAKIS NFTMIP-KNE TVMAGYIVST GPLAIAADAV
|
||||
E-WQFYIGGV F-DIPCN--P NSLDHGILIV GYSAKNTIFR KNMPYWIVKN SWGADWGEQG
|
||||
YIYLRRGKNT CGVSNFVSTS II--
|
||||
ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG ALGRTRHALR
|
||||
FARFAVRYGK SYESAAEVRR RFRIFSESLE EVRSTN---- RKGLPYRLGI NRFSDMSWEE
|
||||
FQATRL-GAA QTCSATLAGN HLMRDA--AA LPETKDWRED G-IVSPVKNQ AHCGSCWTFS
|
||||
TTGALEAAYT QATGKNISLS EQQLVDCAGG FNNF------ --GCNGGLPS QAFEYIKYNG
|
||||
GIDTEESYPY KGVNGV-CHY KAENAAVQVL DSVNITLNAE DELKNAVGLV RPVSVAFQVI
|
||||
DGFRQYKSGV YTSDHCGTTP DDVNHAVLAV GYGVENGV-- ---PYWLIKN SWGADWGDNG
|
||||
YFKMEMGKNM CAIATCASYP VVAA
|
||||
CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- --------FH
|
||||
FKSWMSKHRK TY-STEEYHH RLQTFASNWR KINAHN---- NGNHTFKMAL NQFSDMSFAE
|
||||
IKHKYLWSEP QNCSAT--KS NYLRGT--GP YPPSVDWRKK GNFVSPVKNQ GACGSCWTFS
|
||||
TTGALESAIA IATGKMLSLA EQQLVDCAQD FNNY------ --GCQGGLPS QAFEYILYNK
|
||||
GIMGEDTYPY QGKDGY-CKF QPGKAIGFVK DVANITIYDE EAMVEAVALY NPVSFAFEVT
|
||||
QDFMMYRTGI YSSTSCHKTP DKVNHAVLAV GYGEKNGI-- ---PYWIVKN SWGPQWGMNG
|
||||
YFLIERGKNM CGLAACASYP IPLV
|
||||
|
||||
In the sequential format, the complete alignment for one sequence is
|
||||
@ -2671,32 +2671,32 @@ the Biopython source distribution):
|
||||
.. code:: text
|
||||
|
||||
3 384
|
||||
CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- --------SQ
|
||||
ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG ALGRTRHALR
|
||||
CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- --------FH
|
||||
CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- --------SQ
|
||||
ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG ALGRTRHALR
|
||||
CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- --------FH
|
||||
|
||||
FLEFQDKFNK KY-SHEEYLE RFEIFKSNLG KIEELNLIAI NHKADTKFGV NKFADLSSDE
|
||||
FARFAVRYGK SYESAAEVRR RFRIFSESLE EVRSTN---- RKGLPYRLGI NRFSDMSWEE
|
||||
FKSWMSKHRK TY-STEEYHH RLQTFASNWR KINAHN---- NGNHTFKMAL NQFSDMSFAE
|
||||
FLEFQDKFNK KY-SHEEYLE RFEIFKSNLG KIEELNLIAI NHKADTKFGV NKFADLSSDE
|
||||
FARFAVRYGK SYESAAEVRR RFRIFSESLE EVRSTN---- RKGLPYRLGI NRFSDMSWEE
|
||||
FKSWMSKHRK TY-STEEYHH RLQTFASNWR KINAHN---- NGNHTFKMAL NQFSDMSFAE
|
||||
|
||||
FKNYYLNNKE AIFTDDLPVA DYLDDEFINS IPTAFDWRTR G-AVTPVKNQ GQCGSCWSFS
|
||||
FQATRL-GAA QTCSATLAGN HLMRDA--AA LPETKDWRED G-IVSPVKNQ AHCGSCWTFS
|
||||
IKHKYLWSEP QNCSAT--KS NYLRGT--GP YPPSVDWRKK GNFVSPVKNQ GACGSCWTFS
|
||||
FKNYYLNNKE AIFTDDLPVA DYLDDEFINS IPTAFDWRTR G-AVTPVKNQ GQCGSCWSFS
|
||||
FQATRL-GAA QTCSATLAGN HLMRDA--AA LPETKDWRED G-IVSPVKNQ AHCGSCWTFS
|
||||
IKHKYLWSEP QNCSAT--KS NYLRGT--GP YPPSVDWRKK GNFVSPVKNQ GACGSCWTFS
|
||||
|
||||
TTGNVEGQHF ISQNKLVSLS EQNLVDCDHE CMEYEGEEAC DEGCNGGLQP NAYNYIIKNG
|
||||
TTGALEAAYT QATGKNISLS EQQLVDCAGG FNNF------ --GCNGGLPS QAFEYIKYNG
|
||||
TTGALESAIA IATGKMLSLA EQQLVDCAQD FNNY------ --GCQGGLPS QAFEYILYNK
|
||||
TTGNVEGQHF ISQNKLVSLS EQNLVDCDHE CMEYEGEEAC DEGCNGGLQP NAYNYIIKNG
|
||||
TTGALEAAYT QATGKNISLS EQQLVDCAGG FNNF------ --GCNGGLPS QAFEYIKYNG
|
||||
TTGALESAIA IATGKMLSLA EQQLVDCAQD FNNY------ --GCQGGLPS QAFEYILYNK
|
||||
|
||||
GIQTESSYPY TAETGTQCNF NSANIGAKIS NFTMIP-KNE TVMAGYIVST GPLAIAADAV
|
||||
GIDTEESYPY KGVNGV-CHY KAENAAVQVL DSVNITLNAE DELKNAVGLV RPVSVAFQVI
|
||||
GIMGEDTYPY QGKDGY-CKF QPGKAIGFVK DVANITIYDE EAMVEAVALY NPVSFAFEVT
|
||||
GIQTESSYPY TAETGTQCNF NSANIGAKIS NFTMIP-KNE TVMAGYIVST GPLAIAADAV
|
||||
GIDTEESYPY KGVNGV-CHY KAENAAVQVL DSVNITLNAE DELKNAVGLV RPVSVAFQVI
|
||||
GIMGEDTYPY QGKDGY-CKF QPGKAIGFVK DVANITIYDE EAMVEAVALY NPVSFAFEVT
|
||||
|
||||
E-WQFYIGGV F-DIPCN--P NSLDHGILIV GYSAKNTIFR KNMPYWIVKN SWGADWGEQG
|
||||
DGFRQYKSGV YTSDHCGTTP DDVNHAVLAV GYGVENGV-- ---PYWLIKN SWGADWGDNG
|
||||
QDFMMYRTGI YSSTSCHKTP DKVNHAVLAV GYGEKNGI-- ---PYWIVKN SWGPQWGMNG
|
||||
E-WQFYIGGV F-DIPCN--P NSLDHGILIV GYSAKNTIFR KNMPYWIVKN SWGADWGEQG
|
||||
DGFRQYKSGV YTSDHCGTTP DDVNHAVLAV GYGVENGV-- ---PYWLIKN SWGADWGDNG
|
||||
QDFMMYRTGI YSSTSCHKTP DKVNHAVLAV GYGEKNGI-- ---PYWIVKN SWGPQWGMNG
|
||||
|
||||
YIYLRRGKNT CGVSNFVSTS II--
|
||||
YFKMEMGKNM CAIATCASYP VVAA
|
||||
YIYLRRGKNT CGVSNFVSTS II--
|
||||
YFKMEMGKNM CAIATCASYP VVAA
|
||||
YFLIERGKNM CGLAACASYP IPLV
|
||||
|
||||
The parser in ``Bio.Align`` detects from the file contents if it is in
|
||||
@ -2843,7 +2843,7 @@ local pairwise sequence alignment (available as ``water.txt`` in the
|
||||
|
||||
|
||||
#---------------------------------------
|
||||
#---------------------------------------
|
||||
#---------------------------------------
|
||||
|
||||
As this output file contains only one alignment, we can use
|
||||
``Align.read`` to extract it directly. Here, instead we will use
|
||||
@ -3209,6 +3209,33 @@ using Python’s built-in ``format`` function writes a vulgar line:
|
||||
vulgar: gi|296143771|ref|NM_001180731.1| 0 1230 + gi|330443520|ref|NC_001136.10| 1319275 1318045 - 6146 M 1 1 C 3 3 M 1226 1226
|
||||
<BLANKLINE>
|
||||
|
||||
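The ``Alignment.format()`` method also accepts an optional ``scoring`` keyword argument, which can be a substitution matrix or a ``PairwiseAligner`` object (in which case the aligner's substitution matrix is used).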
|
||||
If you provide a substitution matrix (for example, ``scoring=M`` where
|
||||
``M = Bio.Align.substitution_matrices.load("NUC.4.4")``), the middle pattern
|
||||
line will reflect the substitution scores:
|
||||
|
||||
* ``|`` for identical residues,
|
||||
* ``:`` for substitutions with a positive score,
|
||||
* ``.`` for substitutions with a zero or negative score,
|
||||
* ``-`` for gaps.
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> from Bio.Align import PairwiseAligner
|
||||
>>> from Bio.Align import substitution_matrices
|
||||
>>> M = substitution_matrices.load("NUC.4.4")
|
||||
>>> aligner = PairwiseAligner()
|
||||
>>> aligner.open_gap_score = -100
|
||||
>>> aligner.extend_gap_score = -100
|
||||
>>> aln = aligner.align("GATTACAT", "GATYACAC")[0]
|
||||
>>> print(aln.format("", scoring=M))
|
||||
target 0 GATTACAT 8
|
||||
0 |||:|||. 8
|
||||
query 0 GATYACAC 8
|
||||
<BLANKLINE>
|
||||
|
||||
Using the ``format`` method allows us to request either a vulgar line
|
||||
(default) or a cigar line:
|
||||
|
||||
|
1
NEWS.rst
1
NEWS.rst
@ -33,6 +33,7 @@ being used for additional metadata, typically from computational tools.
|
||||
Many thanks to the Biopython developers and community for making this release
|
||||
possible, especially the following contributors:
|
||||
|
||||
- Rachel Stern (first contribution)
|
||||
- Oliver Wissett (first contribution)
|
||||
- Samuel Prince (first contribution)
|
||||
|
||||
|
160
Tests/test_format_matrix_unittest.py
Normal file
160
Tests/test_format_matrix_unittest.py
Normal file
@ -0,0 +1,160 @@
|
||||
# Tests/test_format_matrix_unittest.py
|
||||
"""Unit tests for Alignment.format() with substitution matrices.
|
||||
|
||||
These tests cover the behavior of pretty-printed alignments when a
|
||||
substitution matrix (e.g., from Bio.Align.substitution_matrices) is
|
||||
supplied directly or via a PairwiseAligner object.
|
||||
|
||||
Conventions being tested:
|
||||
* '|' (pipe): identity (the same residue on both sequences).
|
||||
* ':' (colon): positive mismatch (substitution with a positive score).
|
||||
* '.' (dot): non-positive mismatch (substitution with a zero or negative score).
|
||||
* '-' (dash): gap (insertion/deletion).
|
||||
|
||||
Test cases include:
|
||||
* NUC.4.4 matrix, where T~Y is a positive mismatch.
|
||||
* A BLASTN-like artificial matrix with +1 for matches and -1 for mismatches.
|
||||
* Case-insensitivity: lowercase residues behave the same as uppercase.
|
||||
* Handling of gaps in alignments.
|
||||
* A "mixed block" test where '|', ':' and '.' all appear in a single ungapped block.
|
||||
|
||||
These tests ensure that the new feature of showing ':' for positive
|
||||
substitution scores is consistently applied across different input
|
||||
styles and substitution matrices.
|
||||
"""
|
||||
import unittest
|
||||
import numpy as np
|
||||
from Bio.Align import PairwiseAligner
|
||||
from Bio.Align.substitution_matrices import load, Array
|
||||
|
||||
|
||||
def _blastn_like_matrix():
    """Return a BLASTN-like substitution matrix over the letters "ACGTY".

    Identical letters score +1 and every pair of different letters
    scores -1, so no mismatch can ever have a positive score.
    """
    letters = "ACGTY"
    size = len(letters)
    # 2 * identity - 1 puts +1 on the diagonal and -1 everywhere else.
    scores = 2 * np.eye(size, dtype=int) - 1
    return Array(alphabet=letters, dims=2, data=scores)
|
||||
|
||||
|
||||
class TestFormatMatrix(unittest.TestCase):
    """Unit tests for Alignment.format() with substitution matrices.

    Each test aligns two short sequences and compares the complete
    pretty-printed output of ``format("", scoring=...)`` with the expected
    three-line block (target line, pattern line, query line).
    """

    def test_nuc44_gives_colon_for_positive_mismatch_TY(self):
        """In NUC.4.4, T vs Y has a positive score -> expect ':' in the pattern."""
        M = load("NUC.4.4")
        aligner = PairwiseAligner()
        aligner.open_gap_score = -1
        aligner.extend_gap_score = -0.5
        aln = aligner.align("GATTACAT", "GATYACAC")[0]
        self.assertEqual(
            aln.format("", scoring=M),
            """\
target            0 GATTACAT 8
                  0 |||:|||. 8
query             0 GATYACAC 8
""",
        )

    def test_blastn_like_has_no_colon_only_pipes_for_identities(self):
        """With a +1/-1 matrix all mismatches are negative -> no ':' expected."""
        M = _blastn_like_matrix()
        aligner = PairwiseAligner()
        aligner.open_gap_score = -1
        aligner.extend_gap_score = -0.5
        aln = aligner.align("GATTACAT", "GATYACAC")[0]
        self.assertEqual(
            aln.format("", scoring=M),
            """\
target            0 GATTACAT 8
                  0 |||.|||. 8
query             0 GATYACAC 8
""",
        )

    def test_positive_mismatch_colon_when_passing_aligner_object(self):
        """Passing the aligner itself as scoring should also yield ':'."""
        M = load("NUC.4.4")
        aligner = PairwiseAligner()
        aligner.substitution_matrix = M
        aligner.open_gap_score = -10
        aligner.extend_gap_score = -2
        aln = aligner.align("GATTACAT", "GATYACAC")[0]
        # Pass the aligner (not the matrix) so that the PairwiseAligner
        # branch of Alignment.format() is the one being exercised.
        self.assertEqual(
            aln.format("", scoring=aligner),
            """\
target            0 GATTACAT 8
                  0 |||:|||. 8
query             0 GATYACAC 8
""",
        )

    def test_lowercase_letters_are_case_insensitive_for_matrix_lookup(self):
        """Matrix lookup is case-insensitive ('t' vs 'y' behaves like 'T' vs 'Y')."""
        M = load("NUC.4.4")
        aligner = PairwiseAligner()
        aligner.open_gap_score = -10
        aligner.extend_gap_score = -2
        aln = aligner.align("t", "y")[0]
        self.assertEqual(
            aln.format("", scoring=M),
            """\
target            0 t 1
                  0 : 1
query             0 y 1
""",
        )

    def test_negative_mismatch_dot_with_blastn_like_matrix(self):
        """In the BLASTN-like matrix, mismatches are negative -> expect '.'."""
        M = _blastn_like_matrix()
        aligner = PairwiseAligner()
        aligner.open_gap_score = -10
        aligner.extend_gap_score = -2
        aln = aligner.align("A", "C")[0]
        self.assertEqual(
            aln.format("", scoring=M),
            """\
target            0 A 1
                  0 . 1
query             0 C 1
""",
        )

    def test_gap_is_dash_in_pattern(self):
        """Gaps in the alignment should always appear as '-' in the pattern line."""
        M = load("NUC.4.4")
        aligner = PairwiseAligner()
        aligner.open_gap_score = -1
        aligner.extend_gap_score = -0.5
        aln = aligner.align("AC", "AGC")[0]
        self.assertEqual(
            aln.format("", scoring=M),
            """\
target            0 A-C 2
                  0 |-| 3
query             0 AGC 3
""",
        )

    def test_mixed_block_contains_expected_symbols(self):
        """Expect '|', ':' and '.' together in a single ungapped block."""
        M = load("NUC.4.4")
        aligner = PairwiseAligner()
        aligner.substitution_matrix = M
        aligner.open_gap_score = -10
        aligner.extend_gap_score = -2
        seq1 = "TTTG"
        seq2 = "TYGG"
        aln = aligner.align(seq1, seq2)[0]
        self.assertEqual(
            aln.format("", scoring=M),
            """\
target            0 TTTG 4
                  0 |:.| 4
query             0 TYGG 4
""",
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
Reference in New Issue
Block a user