diff --git a/Bio/Align/__init__.py b/Bio/Align/__init__.py index b7bb312e9..fc7a0a50f 100644 --- a/Bio/Align/__init__.py +++ b/Bio/Align/__init__.py @@ -2184,6 +2184,16 @@ class Alignment: create a human-readable representation of the alignment, or any of the alignment file formats supported by `Bio.Align` (some have not yet been implemented). + - scoring - Optional keyword-only parameter; default=None. + If provided, can be: + + - A substitution matrix (typically from the + `Bio.Align.substitution_matrices` submodule) + used to mark positive matches (:) in the alignment string + when two different residues have a positive score. + + - A PairwiseAligner object, in which case its substitution + matrix and settings are used for determining positive matches. All other arguments are passed to the format-specific writer functions: - mask - PSL format only. Specify if repeat regions in the target @@ -2202,8 +2212,23 @@ class Alignment: the alignment and include it in the output. If False (default), do not include the MD tag in the output. 
""" + scoring = kwargs.pop("scoring", None) + substitution_matrix = None + if scoring is None and args: + first = args[0] + if isinstance(first, PairwiseAligner): + substitution_matrix = first.substitution_matrix + args = args[1:] + elif isinstance(first, (np.ndarray, substitution_matrices.Array)): + substitution_matrix = first + args = args[1:] + if substitution_matrix is None and scoring is not None: + if isinstance(scoring, PairwiseAligner): + substitution_matrix = scoring.substitution_matrix + elif isinstance(scoring, (np.ndarray, substitution_matrices.Array)): + substitution_matrix = scoring if fmt == "": - return self._format_pretty() + return self._format_pretty(substitution_matrix) module = _load(fmt) if module.AlignmentIterator.mode == "b": raise ValueError(f"{fmt} is a binary file format") @@ -2215,10 +2240,17 @@ class Alignment: ) from None return writer.format_alignment(self) - def _format_pretty(self): + def _format_pretty(self, matrix=None): """Return default string representation (PRIVATE). Helper for self.format(). + + Arguments: + - matrix - Optional; default=None + A substitution matrix (typically from the + `Bio.Align.substitution_matrices` submodule) + used to mark positive matches (:) in the alignment string + when two different residues have a positive score. """ n = len(self.sequences) if n == 2: @@ -2271,7 +2303,7 @@ class Alignment: row[:] = end - positions if isinstance(seq, str): if not seq.isascii(): - return self._format_unicode() + return self._format_unicode(matrix) elif isinstance(seq, (Seq, MutableSeq)): try: seq = bytes(seq) @@ -2282,7 +2314,7 @@ class Alignment: seq = s seq = seq.decode() else: - return self._format_generalized() + return self._format_generalized(matrix) seqs.append(seq) minstep = steps.min(0) maxstep = steps.max(0) @@ -2381,6 +2413,10 @@ class Alignment: c = "-" else: c = "." 
+ if matrix is not None and c1 != " " and c2 != " ": + c1u, c2u = c1.upper(), c2.upper() + if matrix[c1u, c2u] > 0: + c = ":" pattern += c pattern_line = " %9d %s" % (position, pattern) pattern_lines.append(pattern_line) @@ -2426,10 +2462,17 @@ class Alignment: blocks.append(block) return "\n".join(blocks) - def _format_unicode(self): + def _format_unicode(self, matrix=None): """Return default string representation (PRIVATE). Helper for self.format(). + + Arguments: + - matrix - Optional; default=None + A substitution matrix (typically from the + `Bio.Align.substitution_matrices` submodule) + used to mark positive matches (:) in the alignment string + when two different residues have a positive score. """ seqs = [] names = [] @@ -2437,7 +2480,7 @@ class Alignment: for seq, row in zip(self.sequences, coordinates): seq = self._convert_sequence_string(seq) if seq is None: - return self._format_generalized() + return self._format_generalized(matrix) if row[0] > row[-1]: # mapped to reverse strand row[:] = len(seq) - row[:] seq = reverse_complement(seq) @@ -2479,13 +2522,24 @@ class Alignment: c = "-" else: c = "." + if matrix is not None and c1 != " " and c2 != " ": + c1u, c2u = c1.upper(), c2.upper() + if matrix[c1u, c2u] > 0: + c = ":" pattern += c return f"{aligned_seq1}\n{pattern}\n{aligned_seq2}\n" - def _format_generalized(self): + def _format_generalized(self, matrix=None): """Return generalized string representation (PRIVATE). Helper for self._format_pretty(). + + Arguments: + - matrix - Optional; default=None + A substitution matrix (typically from the + `Bio.Align.substitution_matrices` submodule) + used to mark positive matches (:) in the alignment string + when two different residues have a positive score. """ seq1, seq2 = self.sequences aligned_seq1 = [] @@ -2540,6 +2594,10 @@ class Alignment: p = "|" else: p = "." 
+ if matrix is not None: + c1u, c2u = c1.upper(), c2.upper() + if matrix[c1u, c2u] > 0: + p = ":" if m1 < m2: space = (m2 - m1) * " " s1 += space @@ -3544,10 +3602,10 @@ class Alignment: start1, start2 = end1, end2 return m - def counts(self, argument=None): + def counts(self, scoring=None): """Count the number of identities, mismatches, and gaps of an alignment. - This method takes a single optional argument, which can be either None + This method takes a single optional argument named scoring, which can be either None (default), a substitution matrix, a wildcard character, or a pairwise aligner object: @@ -3631,7 +3689,7 @@ class Alignment: side of the alignment; - open_right_deletions - the number of deletion gaps opened on the right side of the alignment; - - open_internal_insertions - the number of insertion gaps opaned in the + - open_internal_insertions - the number of insertion gaps opened in the interior of the alignment; - open_internal_deletions - the number of deletion gaps opened in the interior of the alignment; @@ -3671,15 +3729,15 @@ class Alignment: aligner = None wildcard = None substitution_matrix = None - if isinstance(argument, PairwiseAligner): - aligner = argument + if isinstance(scoring, PairwiseAligner): + aligner = scoring substitution_matrix = aligner.substitution_matrix - elif isinstance(argument, str): - wildcard = argument - elif isinstance(argument, (np.ndarray, substitution_matrices.Array)): - substitution_matrix = argument - elif argument is not None: - raise ValueError(f"unexpected argument {argument!r}") + elif isinstance(scoring, str): + wildcard = scoring + elif isinstance(scoring, (np.ndarray, substitution_matrices.Array)): + substitution_matrix = scoring + elif scoring is not None: + raise ValueError(f"unexpected argument {scoring!r}") if substitution_matrix is None: alphabet = [] codec = "utf-32-le" if sys.byteorder == "little" else "utf-32-be" diff --git a/CONTRIB.rst b/CONTRIB.rst index b13d1bdd1..47f7ed8f1 100644 --- 
a/CONTRIB.rst +++ b/CONTRIB.rst @@ -287,6 +287,7 @@ please open an issue on GitHub or mention it on the mailing list. - Philip Capel - Phillip Garland - Pol Estecha +- Rachel Stern - Ralf Stephan - Rasmus Fonseca - rht diff --git a/Doc/Tutorial/chapter_align.rst b/Doc/Tutorial/chapter_align.rst index b43f46f08..cf19daf2e 100644 --- a/Doc/Tutorial/chapter_align.rst +++ b/Doc/Tutorial/chapter_align.rst @@ -2427,8 +2427,8 @@ format used by PFAM: #=GF RN [1] #=GF RM 3130377 #=GF RT Microsequence analysis of DNA-binding proteins 7a, 7b, and 7e - #=GF RT from the archaebacterium Sulfolobus acidocaldarius. - #=GF RA Choli T, Wittmann-Liebold B, Reinhardt R; + #=GF RT from the archaebacterium Sulfolobus acidocaldarius. + #=GF RA Choli T, Wittmann-Liebold B, Reinhardt R; #=GF RL J Biol Chem 1988;263:7087-7093. #=GF DR INTERPRO; IPR003212; #=GF DR SCOP; 1sso; fa; @@ -2640,26 +2640,26 @@ source distribution): .. code:: text 3 384 - CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- --------SQ - FLEFQDKFNK KY-SHEEYLE RFEIFKSNLG KIEELNLIAI NHKADTKFGV NKFADLSSDE - FKNYYLNNKE AIFTDDLPVA DYLDDEFINS IPTAFDWRTR G-AVTPVKNQ GQCGSCWSFS - TTGNVEGQHF ISQNKLVSLS EQNLVDCDHE CMEYEGEEAC DEGCNGGLQP NAYNYIIKNG - GIQTESSYPY TAETGTQCNF NSANIGAKIS NFTMIP-KNE TVMAGYIVST GPLAIAADAV - E-WQFYIGGV F-DIPCN--P NSLDHGILIV GYSAKNTIFR KNMPYWIVKN SWGADWGEQG - YIYLRRGKNT CGVSNFVSTS II-- - ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG ALGRTRHALR - FARFAVRYGK SYESAAEVRR RFRIFSESLE EVRSTN---- RKGLPYRLGI NRFSDMSWEE - FQATRL-GAA QTCSATLAGN HLMRDA--AA LPETKDWRED G-IVSPVKNQ AHCGSCWTFS - TTGALEAAYT QATGKNISLS EQQLVDCAGG FNNF------ --GCNGGLPS QAFEYIKYNG - GIDTEESYPY KGVNGV-CHY KAENAAVQVL DSVNITLNAE DELKNAVGLV RPVSVAFQVI - DGFRQYKSGV YTSDHCGTTP DDVNHAVLAV GYGVENGV-- ---PYWLIKN SWGADWGDNG - YFKMEMGKNM CAIATCASYP VVAA - CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- --------FH - FKSWMSKHRK TY-STEEYHH RLQTFASNWR KINAHN---- NGNHTFKMAL NQFSDMSFAE - IKHKYLWSEP QNCSAT--KS 
NYLRGT--GP YPPSVDWRKK GNFVSPVKNQ GACGSCWTFS - TTGALESAIA IATGKMLSLA EQQLVDCAQD FNNY------ --GCQGGLPS QAFEYILYNK - GIMGEDTYPY QGKDGY-CKF QPGKAIGFVK DVANITIYDE EAMVEAVALY NPVSFAFEVT - QDFMMYRTGI YSSTSCHKTP DKVNHAVLAV GYGEKNGI-- ---PYWIVKN SWGPQWGMNG + CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- --------SQ + FLEFQDKFNK KY-SHEEYLE RFEIFKSNLG KIEELNLIAI NHKADTKFGV NKFADLSSDE + FKNYYLNNKE AIFTDDLPVA DYLDDEFINS IPTAFDWRTR G-AVTPVKNQ GQCGSCWSFS + TTGNVEGQHF ISQNKLVSLS EQNLVDCDHE CMEYEGEEAC DEGCNGGLQP NAYNYIIKNG + GIQTESSYPY TAETGTQCNF NSANIGAKIS NFTMIP-KNE TVMAGYIVST GPLAIAADAV + E-WQFYIGGV F-DIPCN--P NSLDHGILIV GYSAKNTIFR KNMPYWIVKN SWGADWGEQG + YIYLRRGKNT CGVSNFVSTS II-- + ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG ALGRTRHALR + FARFAVRYGK SYESAAEVRR RFRIFSESLE EVRSTN---- RKGLPYRLGI NRFSDMSWEE + FQATRL-GAA QTCSATLAGN HLMRDA--AA LPETKDWRED G-IVSPVKNQ AHCGSCWTFS + TTGALEAAYT QATGKNISLS EQQLVDCAGG FNNF------ --GCNGGLPS QAFEYIKYNG + GIDTEESYPY KGVNGV-CHY KAENAAVQVL DSVNITLNAE DELKNAVGLV RPVSVAFQVI + DGFRQYKSGV YTSDHCGTTP DDVNHAVLAV GYGVENGV-- ---PYWLIKN SWGADWGDNG + YFKMEMGKNM CAIATCASYP VVAA + CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- --------FH + FKSWMSKHRK TY-STEEYHH RLQTFASNWR KINAHN---- NGNHTFKMAL NQFSDMSFAE + IKHKYLWSEP QNCSAT--KS NYLRGT--GP YPPSVDWRKK GNFVSPVKNQ GACGSCWTFS + TTGALESAIA IATGKMLSLA EQQLVDCAQD FNNY------ --GCQGGLPS QAFEYILYNK + GIMGEDTYPY QGKDGY-CKF QPGKAIGFVK DVANITIYDE EAMVEAVALY NPVSFAFEVT + QDFMMYRTGI YSSTSCHKTP DKVNHAVLAV GYGEKNGI-- ---PYWIVKN SWGPQWGMNG YFLIERGKNM CGLAACASYP IPLV In the sequential format, the complete alignment for one sequence is @@ -2671,32 +2671,32 @@ the Biopython source distribution): .. 
code:: text 3 384 - CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- --------SQ - ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG ALGRTRHALR - CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- --------FH + CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- --------SQ + ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG ALGRTRHALR + CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- --------FH - FLEFQDKFNK KY-SHEEYLE RFEIFKSNLG KIEELNLIAI NHKADTKFGV NKFADLSSDE - FARFAVRYGK SYESAAEVRR RFRIFSESLE EVRSTN---- RKGLPYRLGI NRFSDMSWEE - FKSWMSKHRK TY-STEEYHH RLQTFASNWR KINAHN---- NGNHTFKMAL NQFSDMSFAE + FLEFQDKFNK KY-SHEEYLE RFEIFKSNLG KIEELNLIAI NHKADTKFGV NKFADLSSDE + FARFAVRYGK SYESAAEVRR RFRIFSESLE EVRSTN---- RKGLPYRLGI NRFSDMSWEE + FKSWMSKHRK TY-STEEYHH RLQTFASNWR KINAHN---- NGNHTFKMAL NQFSDMSFAE - FKNYYLNNKE AIFTDDLPVA DYLDDEFINS IPTAFDWRTR G-AVTPVKNQ GQCGSCWSFS - FQATRL-GAA QTCSATLAGN HLMRDA--AA LPETKDWRED G-IVSPVKNQ AHCGSCWTFS - IKHKYLWSEP QNCSAT--KS NYLRGT--GP YPPSVDWRKK GNFVSPVKNQ GACGSCWTFS + FKNYYLNNKE AIFTDDLPVA DYLDDEFINS IPTAFDWRTR G-AVTPVKNQ GQCGSCWSFS + FQATRL-GAA QTCSATLAGN HLMRDA--AA LPETKDWRED G-IVSPVKNQ AHCGSCWTFS + IKHKYLWSEP QNCSAT--KS NYLRGT--GP YPPSVDWRKK GNFVSPVKNQ GACGSCWTFS - TTGNVEGQHF ISQNKLVSLS EQNLVDCDHE CMEYEGEEAC DEGCNGGLQP NAYNYIIKNG - TTGALEAAYT QATGKNISLS EQQLVDCAGG FNNF------ --GCNGGLPS QAFEYIKYNG - TTGALESAIA IATGKMLSLA EQQLVDCAQD FNNY------ --GCQGGLPS QAFEYILYNK + TTGNVEGQHF ISQNKLVSLS EQNLVDCDHE CMEYEGEEAC DEGCNGGLQP NAYNYIIKNG + TTGALEAAYT QATGKNISLS EQQLVDCAGG FNNF------ --GCNGGLPS QAFEYIKYNG + TTGALESAIA IATGKMLSLA EQQLVDCAQD FNNY------ --GCQGGLPS QAFEYILYNK - GIQTESSYPY TAETGTQCNF NSANIGAKIS NFTMIP-KNE TVMAGYIVST GPLAIAADAV - GIDTEESYPY KGVNGV-CHY KAENAAVQVL DSVNITLNAE DELKNAVGLV RPVSVAFQVI - GIMGEDTYPY QGKDGY-CKF QPGKAIGFVK DVANITIYDE EAMVEAVALY NPVSFAFEVT + GIQTESSYPY TAETGTQCNF NSANIGAKIS NFTMIP-KNE TVMAGYIVST GPLAIAADAV + 
GIDTEESYPY KGVNGV-CHY KAENAAVQVL DSVNITLNAE DELKNAVGLV RPVSVAFQVI + GIMGEDTYPY QGKDGY-CKF QPGKAIGFVK DVANITIYDE EAMVEAVALY NPVSFAFEVT - E-WQFYIGGV F-DIPCN--P NSLDHGILIV GYSAKNTIFR KNMPYWIVKN SWGADWGEQG - DGFRQYKSGV YTSDHCGTTP DDVNHAVLAV GYGVENGV-- ---PYWLIKN SWGADWGDNG - QDFMMYRTGI YSSTSCHKTP DKVNHAVLAV GYGEKNGI-- ---PYWIVKN SWGPQWGMNG + E-WQFYIGGV F-DIPCN--P NSLDHGILIV GYSAKNTIFR KNMPYWIVKN SWGADWGEQG + DGFRQYKSGV YTSDHCGTTP DDVNHAVLAV GYGVENGV-- ---PYWLIKN SWGADWGDNG + QDFMMYRTGI YSSTSCHKTP DKVNHAVLAV GYGEKNGI-- ---PYWIVKN SWGPQWGMNG - YIYLRRGKNT CGVSNFVSTS II-- - YFKMEMGKNM CAIATCASYP VVAA + YIYLRRGKNT CGVSNFVSTS II-- + YFKMEMGKNM CAIATCASYP VVAA YFLIERGKNM CGLAACASYP IPLV The parser in ``Bio.Align`` detects from the file contents if it is in @@ -2843,7 +2843,7 @@ local pairwise sequence alignment (available as ``water.txt`` in the #--------------------------------------- - #--------------------------------------- + #--------------------------------------- As this output file contains only one alignment, we can use ``Align.read`` to extract it directly. Here, instead we will use @@ -3209,6 +3209,33 @@ using Python’s built-in ``format`` function writes a vulgar line: vulgar: gi|296143771|ref|NM_001180731.1| 0 1230 + gi|330443520|ref|NC_001136.10| 1319275 1318045 - 6146 M 1 1 C 3 3 M 1226 1226 +The ``Alignment.format()`` method also accepts an optional ``scoring`` argument. +If you provide a substitution matrix (for example, ``scoring=M`` where +``M = Bio.Align.substitution_matrices.load("NUC.4.4")``), the middle pattern +line will reflect the substitution scores: + +* ``|`` for identical residues, +* ``:`` for substitutions with a positive score, +* ``.`` for substitutions with a zero or negative score, +* ``-`` for gaps. + +.. cont-doctest + +.. 
code:: pycon + + >>> from Bio.Align import PairwiseAligner + >>> from Bio.Align import substitution_matrices + >>> M = substitution_matrices.load("NUC.4.4") + >>> aligner = PairwiseAligner() + >>> aligner.open_gap_score = -100 + >>> aligner.extend_gap_score = -100 + >>> aln = aligner.align("GATTACAT", "GATYACAC")[0] + >>> print(aln.format("", scoring=M)) + target 0 GATTACAT 8 + 0 |||:|||. 8 + query 0 GATYACAC 8 + + Using the ``format`` method allows us to request either a vulgar line (default) or a cigar line: diff --git a/NEWS.rst b/NEWS.rst index a30b37380..ff115d74d 100644 --- a/NEWS.rst +++ b/NEWS.rst @@ -33,6 +33,7 @@ being used for additional metadata, typically from computational tools. Many thanks to the Biopython developers and community for making this release possible, especially the following contributors: +- Rachel Stern (first contribution) - Oliver Wissett (first contribution) - Samuel Prince (first contribution) diff --git a/Tests/test_format_matrix_unittest.py b/Tests/test_format_matrix_unittest.py new file mode 100644 index 000000000..7c868a6a9 --- /dev/null +++ b/Tests/test_format_matrix_unittest.py @@ -0,0 +1,160 @@ +# Tests/test_format_matrix_unittest.py +"""Unit tests for Alignment.format() with substitution matrices. + +These tests cover the behavior of pretty-printed alignments when a +substitution matrix (e.g., from Bio.Align.substitution_matrices) is +supplied directly or via a PairwiseAligner object. + +Conventions being tested: +* '|' (pipe): identity (the same residue on both sequences). +* ':' (colon): positive mismatch (substitution with a positive score). +* '.' (dot): negative mismatch (substitution with a negative score). +* '-' (dash): gap (insertion/deletion). + +Test cases include: +* NUC.4.4 matrix, where T~Y is a positive mismatch. +* A BLASTN-like artificial matrix with +1 for matches and -1 for mismatches. +* Case-insensitivity: lowercase residues behave the same as uppercase. +* Handling of gaps in alignments. 
+* A "mixed block" test where '|', ':' and '.' appear in a single pattern line. + +These tests ensure that the new feature of showing ':' for positive +substitution scores is consistently applied across different input +styles and substitution matrices. +""" +import unittest +import numpy as np +from Bio.Align import PairwiseAligner +from Bio.Align.substitution_matrices import load, Array + + +def _blastn_like_matrix(): + alphabet = "ACGTY" + n = len(alphabet) + data = np.full((n, n), -1, dtype=int) + np.fill_diagonal(data, 1) + return Array(alphabet=alphabet, dims=2, data=data) + + +class TestFormatMatrix(unittest.TestCase): + """Unit tests for Alignment.format() with substitution matrices.""" + + def test_nuc44_gives_colon_for_positive_mismatch_TY(self): + """In NUC.4.4, T vs Y has a positive score -> expect ':' in the pattern.""" + M = load("NUC.4.4") + aligner = PairwiseAligner() + aligner.open_gap_score = -1 + aligner.extend_gap_score = -0.5 + aln = aligner.align("GATTACAT", "GATYACAC")[0] + self.assertEqual( + aln.format("", scoring=M), + """\ +target 0 GATTACAT 8 + 0 |||:|||. 8 +query 0 GATYACAC 8 +""", + ) + + def test_blastn_like_has_no_colon_only_pipes_for_identities(self): + """In a BLASTN-like +1/-1 matrix, mismatches are always negative -> no ':' expected.""" + M = _blastn_like_matrix() + aligner = PairwiseAligner() + aligner.open_gap_score = -1 + aligner.extend_gap_score = -0.5 + aln = aligner.align("GATTACAT", "GATYACAC")[0] + self.assertEqual( + aln.format("", scoring=M), + """\ +target 0 GATTACAT 8 + 0 |||.|||. 
8 +query 0 GATYACAC 8 +""", + ) + + def test_positive_mismatch_colon_when_passing_aligner_object(self): + """Passing the aligner object with a substitution matrix should also yield ':' for positive mismatches.""" + M = load("NUC.4.4") + aligner = PairwiseAligner() + aligner.substitution_matrix = M + aligner.open_gap_score = -10 + aligner.extend_gap_score = -2 + aln = aligner.align("GATTACAT", "GATYACAC")[0] + self.assertEqual( + aln.format("", scoring=aligner), + """\ +target 0 GATTACAT 8 + 0 |||:|||. 8 +query 0 GATYACAC 8 +""", + ) + + def test_lowercase_letters_are_case_insensitive_for_matrix_lookup(self): + """Matrix lookup should be case-insensitive (e.g., 't' vs 'y' behaves like 'T' vs 'Y').""" + M = load("NUC.4.4") + aligner = PairwiseAligner() + aligner.open_gap_score = -10 + aligner.extend_gap_score = -2 + aln = aligner.align("t", "y")[0] + self.assertEqual( + aln.format("", scoring=M), + """\ +target 0 t 1 + 0 : 1 +query 0 y 1 +""", + ) + + def test_negative_mismatch_dot_with_blastn_like_matrix(self): + """In the BLASTN-like matrix, mismatches are negative -> expect '.' in the pattern.""" + M = _blastn_like_matrix() + aligner = PairwiseAligner() + aligner.open_gap_score = -10 + aligner.extend_gap_score = -2 + aln = aligner.align("A", "C")[0] + self.assertEqual( + aln.format("", scoring=M), + """\ +target 0 A 1 + 0 . 
1 +query 0 C 1 +""", + ) + + def test_gap_is_dash_in_pattern(self): + """Gaps in the alignment should always appear as '-' in the pattern line.""" + M = load("NUC.4.4") + aligner = PairwiseAligner() + aligner.open_gap_score = -1 + aligner.extend_gap_score = -0.5 + aln = aligner.align("AC", "AGC")[0] + self.assertEqual( + aln.format("", scoring=M), + """\ +target 0 A-C 2 + 0 |-| 3 +query 0 AGC 3 +""", + ) + + def test_mixed_block_contains_expected_symbols(self): + """Construct an ungapped alignment whose pattern contains '|', ':' and '.' at least once each.""" + M = load("NUC.4.4") + aligner = PairwiseAligner() + aligner.substitution_matrix = M + aligner.open_gap_score = -10 + aligner.extend_gap_score = -2 + seq1 = "TTTG" + seq2 = "TYGG" + aln = aligner.align(seq1, seq2)[0] + self.assertEqual( + aln.format("", scoring=M), + """\ +target 0 TTTG 4 + 0 |:.| 4 +query 0 TYGG 4 +""", + ) + + +if __name__ == "__main__": + unittest.main()