diff --git a/Bio/Align/emboss.py b/Bio/Align/emboss.py index 96b739645..38a1fccc7 100644 --- a/Bio/Align/emboss.py +++ b/Bio/Align/emboss.py @@ -11,7 +11,7 @@ for example from the needle, water, and stretcher tools. """ from Bio.Align import Alignment from Bio.Align import interfaces -from Bio.Seq import Seq +from Bio.Seq import Seq, reverse_complement from Bio.SeqRecord import SeqRecord @@ -99,6 +99,7 @@ class AlignmentIterator(interfaces.AlignmentIterator): aligned_sequences = [""] * number_of_sequences consensus = "" starts = [0] * number_of_sequences + ends = [0] * number_of_sequences column = 0 index = 0 continue @@ -171,14 +172,19 @@ class AlignmentIterator(interfaces.AlignmentIterator): n = len(sequences) for i in range(n): start = starts[i] - if start == 0: - sequence = Seq(sequences[i]) - else: + end = ends[i] + if start < end: coordinates[i, :] += start + data = sequences[i] + else: + start, end = end, start + coordinates[i, :] = end - coordinates[i, :] + data = reverse_complement(sequences[i]) + if start == 0: + sequence = Seq(data) + else: # create a partially defined sequence - length = start + len(sequences[i]) - data = {start: sequences[i]} - sequence = Seq(data, length=length) + sequence = Seq({start: data}, length=end) record = SeqRecord(sequence, identifiers[i]) records.append(record) alignment = Alignment(records, coordinates) @@ -198,21 +204,43 @@ class AlignmentIterator(interfaces.AlignmentIterator): identifier, start = prefix.split(None, 1) assert identifiers[index].startswith(identifier) aligned_sequence, end = line[21:].split(None, 1) - start = int(start) - 1 # Python counting + start = int(start) end = int(end) length = len(sequences[index]) sequence = aligned_sequence.replace("-", "") if length == 0 and len(sequence) > 0: + if start < end: + start -= 1 # Python counting + assert end == start + len(sequence) + else: + end -= 1 # Python counting + assert end == start - len(sequence) # Record the start starts[index] = start else: - if ( - self.metadata["Align_format"] == "srspair" - and len(sequence) == 0 - ): - start += 1 - assert start == starts[index] + length - assert end == start + len(sequence) + if starts[index] <= ends[index]: + # forward strand + if ( + self.metadata["Align_format"] == "srspair" + and len(sequence) == 0 + ): + assert start == ends[index] + assert end == start + else: + start -= 1 + assert end == start + len(sequence) + else: + if ( + self.metadata["Align_format"] == "srspair" + and len(sequence) == 0 + ): + assert start - 1 == ends[index] + assert end == start + else: + end -= 1 + assert end == start - len(sequence) + # Record the end + ends[index] = end sequences[index] += sequence aligned_sequences[index] += aligned_sequence if index == 0: diff --git a/Tests/Emboss/water_reverse1.txt b/Tests/Emboss/water_reverse1.txt new file mode 100644 index 000000000..d2acc5c29 --- /dev/null +++ b/Tests/Emboss/water_reverse1.txt @@ -0,0 +1,47 @@ +######################################## +# Program: water +# Rundate: Sat 22 Oct 2022 23:47:41 +# Commandline: water +# -asequence seqA.fa +# -bsequence seqB.fa +# -gapopen 0.001 +# -gapextend 0.001 +# -sreverse1 +# -outfile water_reverse1.txt +# Align_format: srspair +# Report_file: water_reverse1.txt +######################################## + +#======================================= +# +# Aligned_sequences: 2 +# 1: seqA +# 2: seqB +# Matrix: EDNAFULL +# Gap_penalty: 0.001 +# Extend_penalty: 0.001 +# +# Length: 121 +# Identity: 32/121 (26.4%) +# Similarity: 32/121 (26.4%) +# Gaps: 89/121 (73.6%) +# Score: 159.911 +# +# +#======================================= + +seqA 121 GGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT 72 + ||||||||||||||||||| +seqB 1 GGGGGGGGGGGGGGGGGGG------------------------------- 19 + +seqA 71 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT 22 + +seqB 19 -------------------------------------------------- 19 + +seqA 21 TTTTTTTTCCCCCCCCCCCCC 1 + ||||||||||||| +seqB 20 --------CCCCCCCCCCCCC 32 + + +#--------------------------------------- +#--------------------------------------- diff --git a/Tests/Emboss/water_reverse2.txt b/Tests/Emboss/water_reverse2.txt new file mode 100644 index 000000000..1338a260a --- /dev/null +++ b/Tests/Emboss/water_reverse2.txt @@ -0,0 +1,47 @@ +######################################## +# Program: water +# Rundate: Sun 23 Oct 2022 00:06:18 +# Commandline: water +# -asequence seqA.fa +# -bsequence seqB.fa +# -gapopen 0.001 +# -gapextend 0.001 +# -sreverse2 +# -outfile water_reverse2.txt +# Align_format: srspair +# Report_file: water_reverse2.txt +######################################## + +#======================================= +# +# Aligned_sequences: 2 +# 1: seqA +# 2: seqB +# Matrix: EDNAFULL +# Gap_penalty: 0.001 +# Extend_penalty: 0.001 +# +# Length: 121 +# Identity: 32/121 (26.4%) +# Similarity: 32/121 (26.4%) +# Gaps: 89/121 (73.6%) +# Score: 159.911 +# +# +#======================================= + +seqA 1 GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 50 + ||||||||||||| +seqB 32 GGGGGGGGGGGGG------------------------------------- 20 + +seqA 51 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 100 + +seqB 20 -------------------------------------------------- 20 + +seqA 101 AACCCCCCCCCCCCCCCCCCC 121 + ||||||||||||||||||| +seqB 19 --CCCCCCCCCCCCCCCCCCC 1 + + +#--------------------------------------- +#--------------------------------------- diff --git a/Tests/Emboss/water_reverse3.txt b/Tests/Emboss/water_reverse3.txt new file mode 100644 index 000000000..77ad54de6 --- /dev/null +++ b/Tests/Emboss/water_reverse3.txt @@ -0,0 +1,39 @@ +######################################## +# Program: water +# Rundate: Sat 22 Oct 2022 22:56:03 +# Commandline: water +# -asequence seqA.fa +# -bsequence seqB.fa +# -gapopen 1 +# -gapextend 0.5 +# -sreverse1 +# -outfile water_reverse3.txt +# Align_format: srspair +# Report_file: water_reverse3.txt +######################################## + +#======================================= +# +# Aligned_sequences: 2 +# 1: seqA +# 2: seqB +# Matrix: EDNAFULL +# Gap_penalty: 1.0 +# Extend_penalty: 0.5 +# +# Length: 19 +# Identity: 16/19 (84.2%) +# Similarity: 16/19 (84.2%) +# Gaps: 3/19 (15.8%) +# Score: 77.5 +# +# +#======================================= + +seqA 20 TTTTTTTAAA-CCGGGCCC 3 + ||||||| | |||||||| +seqB 3 TTTTTTT--ACCCGGGCCC 19 + + +#--------------------------------------- +#--------------------------------------- diff --git a/Tests/Emboss/water_reverse4.txt b/Tests/Emboss/water_reverse4.txt new file mode 100644 index 000000000..ca58920a0 --- /dev/null +++ b/Tests/Emboss/water_reverse4.txt @@ -0,0 +1,39 @@ +######################################## +# Program: water +# Rundate: Sat 22 Oct 2022 22:56:15 +# Commandline: water +# -asequence seqA.fa +# -bsequence seqB.fa +# -gapopen 1 +# -gapextend 0.5 +# -sreverse2 +# -outfile water_reverse4.txt +# Align_format: srspair +# Report_file: water_reverse4.txt +######################################## + +#======================================= +# +# Aligned_sequences: 2 +# 1: seqA +# 2: seqB +# Matrix: EDNAFULL +# Gap_penalty: 1.0 +# Extend_penalty: 0.5 +# +# Length: 19 +# Identity: 16/19 (84.2%) +# Similarity: 16/19 (84.2%) +# Gaps: 3/19 (15.8%) +# Score: 77.5 +# +# +#======================================= + +seqA 3 GGGCCCGGTT-TAAAAAAA 20 + |||||||| |||||||| +seqB 19 GGGCCCGG--GTAAAAAAA 3 + + +#--------------------------------------- +#--------------------------------------- diff --git a/Tests/test_Align_emboss.py b/Tests/test_Align_emboss.py index 6538bcea0..6d4128ab3 100644 --- a/Tests/test_Align_emboss.py +++ b/Tests/test_Align_emboss.py @@ -1214,6 +1214,223 @@ numpy.array([['K', 'I', 'L', 'I', 'V', 'D', 'D', 'Q', 'Y', 'G', 'I', 'R', 'I', with self.assertRaises(StopIteration): next(alignments) + def test_water_reverse1(self): + # water -asequence seqA.fa -bsequence seqB.fa -gapopen 10 -gapextend 0.5 -sreverse1 -outfile water_reverse1.txt + path = "Emboss/water_reverse1.txt" + alignments = Align.parse(path, "emboss") + self.assertEqual(alignments.metadata["Program"], "water") + self.assertEqual(alignments.metadata["Rundate"], "Sat 22 Oct 2022 23:47:41") + self.assertEqual( + alignments.metadata["Command line"], + "water -asequence seqA.fa -bsequence seqB.fa -gapopen 0.001 -gapextend 0.001 -sreverse1 -outfile water_reverse1.txt", + ) + self.assertEqual(alignments.metadata["Align_format"], "srspair") + self.assertEqual(alignments.metadata["Report_file"], "water_reverse1.txt") + + alignment = next(alignments) + self.assertEqual(alignment.annotations["Matrix"], "EDNAFULL") + self.assertAlmostEqual(alignment.annotations["Gap_penalty"], 0.001) + self.assertAlmostEqual(alignment.annotations["Extend_penalty"], 0.001) + self.assertEqual(alignment.annotations["Identity"], 32) + self.assertEqual(alignment.annotations["Similarity"], 32) + self.assertEqual(alignment.annotations["Gaps"], 89) + self.assertAlmostEqual(alignment.annotations["Score"], 159.911) + self.assertEqual(len(alignment), 2) + self.assertEqual(alignment.shape, (2, 121)) + self.assertEqual(alignment.sequences[0].id, "seqA") + self.assertEqual(alignment.sequences[1].id, "seqB") + self.assertEqual( + alignment.sequences[0].seq, + "GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCC", + ) + self.assertEqual( + alignment.sequences[1].seq, + "GGGGGGGGGGGGGGGGGGGCCCCCCCCCCCCC", + ) + self.assertTrue( + numpy.array_equal( + alignment.coordinates, + # fmt: off +# flake8: noqa + numpy.array([[121, 102, 13, 0], + [ 0, 19, 19, 32]]) + # fmt: on + ) + ) + self.assertEqual( + alignment[0], + "GGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCC", + ) + self.assertEqual( + alignment[1], + "GGGGGGGGGGGGGGGGGGG-----------------------------------------------------------------------------------------CCCCCCCCCCCCC", + ) + self.assertEqual( + alignment.column_annotations["emboss_consensus"], + "||||||||||||||||||| |||||||||||||", + ) + with self.assertRaises(StopIteration): + next(alignments) + + def test_water_reverse2(self): + # water -asequence seqA.fa -bsequence seqB.fa -gapopen 10 -gapextend 0.5 -sreverse2 -outfile water_reverse2.txt + path = "Emboss/water_reverse2.txt" + alignments = Align.parse(path, "emboss") + self.assertEqual(alignments.metadata["Program"], "water") + self.assertEqual(alignments.metadata["Rundate"], "Sun 23 Oct 2022 00:06:18") + self.assertEqual( + alignments.metadata["Command line"], + "water -asequence seqA.fa -bsequence seqB.fa -gapopen 0.001 -gapextend 0.001 -sreverse2 -outfile water_reverse2.txt", + ) + self.assertEqual(alignments.metadata["Align_format"], "srspair") + self.assertEqual(alignments.metadata["Report_file"], "water_reverse2.txt") + alignment = next(alignments) + self.assertEqual(alignment.annotations["Matrix"], "EDNAFULL") + self.assertAlmostEqual(alignment.annotations["Gap_penalty"], 0.001) + self.assertAlmostEqual(alignment.annotations["Extend_penalty"], 0.001) + self.assertEqual(alignment.annotations["Identity"], 32) + self.assertEqual(alignment.annotations["Similarity"], 32) + self.assertEqual(alignment.annotations["Gaps"], 89) + self.assertAlmostEqual(alignment.annotations["Score"], 159.911) + self.assertEqual(len(alignment), 2) + self.assertEqual(alignment.shape, (2, 121)) + self.assertEqual(alignment.sequences[0].id, "seqA") + self.assertEqual(alignment.sequences[1].id, "seqB") + self.assertEqual( + alignment.sequences[0].seq, + "GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCC", + ) + self.assertEqual( + alignment.sequences[1].seq, + "GGGGGGGGGGGGGGGGGGGCCCCCCCCCCCCC", + ) + self.assertTrue( + numpy.array_equal( + alignment.coordinates, + # fmt: off + numpy.array([[ 0, 13, 102, 121], + [32, 19, 19, 0]]) + # fmt: on + ) + ) + self.assertEqual( + alignment[0], + "GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCC", + ) + self.assertEqual( + alignment[1], + "GGGGGGGGGGGGG-----------------------------------------------------------------------------------------CCCCCCCCCCCCCCCCCCC", + ) + self.assertEqual( + alignment.column_annotations["emboss_consensus"], + "||||||||||||| |||||||||||||||||||", + ) + with self.assertRaises(StopIteration): + next(alignments) + + def test_water_reverse3(self): + # water -asequence seqA.fa -bsequence seqB.fa -gapopen 10 -gapextend 0.5 -sreverse1 -outfile water_reverse3.txt + path = "Emboss/water_reverse3.txt" + alignments = Align.parse(path, "emboss") + self.assertEqual(alignments.metadata["Program"], "water") + self.assertEqual(alignments.metadata["Rundate"], "Sat 22 Oct 2022 22:56:03") + self.assertEqual( + alignments.metadata["Command line"], + "water -asequence seqA.fa -bsequence seqB.fa -gapopen 1 -gapextend 0.5 -sreverse1 -outfile water_reverse3.txt", + ) + self.assertEqual(alignments.metadata["Align_format"], "srspair") + self.assertEqual(alignments.metadata["Report_file"], "water_reverse3.txt") + + alignment = next(alignments) + self.assertEqual(alignment.annotations["Matrix"], "EDNAFULL") + self.assertAlmostEqual(alignment.annotations["Gap_penalty"], 1.0) + self.assertAlmostEqual(alignment.annotations["Extend_penalty"], 0.5) + self.assertEqual(alignment.annotations["Identity"], 16) + self.assertEqual(alignment.annotations["Similarity"], 16) + self.assertEqual(alignment.annotations["Gaps"], 3) + self.assertAlmostEqual(alignment.annotations["Score"], 77.5) + self.assertEqual(len(alignment), 2) + self.assertEqual(alignment.shape, (2, 19)) + self.assertEqual(alignment.sequences[0].id, "seqA") + self.assertEqual(alignment.sequences[1].id, "seqB") + self.assertEqual( + repr(alignment.sequences[0].seq), + "Seq({2: 'GGGCCCGGTTTAAAAAAA'}, length=20)", + ) + self.assertEqual( + repr(alignment.sequences[1].seq), + "Seq({2: 'TTTTTTTACCCGGGCCC'}, length=19)", + ) + self.assertTrue( + numpy.array_equal( + alignment.coordinates, + # fmt: off +# flake8: noqa + numpy.array([[20, 13, 11, 10, 10, 2], + [ 2, 9, 9, 10, 11, 19]]) + # fmt: on + ) + ) + self.assertEqual(alignment[0], "TTTTTTTAAA-CCGGGCCC") + self.assertEqual(alignment[1], "TTTTTTT--ACCCGGGCCC") + self.assertEqual( + alignment.column_annotations["emboss_consensus"], + "||||||| | ||||||||", + ) + with self.assertRaises(StopIteration): + next(alignments) + + def test_water_reverse4(self): + # water -asequence seqA.fa -bsequence seqB.fa -gapopen 10 -gapextend 0.5 -sreverse2 -outfile water_reverse4.txt + path = "Emboss/water_reverse4.txt" + alignments = Align.parse(path, "emboss") + self.assertEqual(alignments.metadata["Program"], "water") + self.assertEqual(alignments.metadata["Rundate"], "Sat 22 Oct 2022 22:56:15") + self.assertEqual( + alignments.metadata["Command line"], + "water -asequence seqA.fa -bsequence seqB.fa -gapopen 1 -gapextend 0.5 -sreverse2 -outfile water_reverse4.txt", + ) + self.assertEqual(alignments.metadata["Align_format"], "srspair") + self.assertEqual(alignments.metadata["Report_file"], "water_reverse4.txt") + alignment = next(alignments) + self.assertEqual(alignment.annotations["Matrix"], "EDNAFULL") + self.assertAlmostEqual(alignment.annotations["Gap_penalty"], 1.0) + self.assertAlmostEqual(alignment.annotations["Extend_penalty"], 0.5) + self.assertEqual(alignment.annotations["Identity"], 16) + self.assertEqual(alignment.annotations["Similarity"], 16) + self.assertEqual(alignment.annotations["Gaps"], 3) + self.assertAlmostEqual(alignment.annotations["Score"], 77.5) + self.assertEqual(len(alignment), 2) + self.assertEqual(alignment.shape, (2, 19)) + self.assertEqual(alignment.sequences[0].id, "seqA") + self.assertEqual(alignment.sequences[1].id, "seqB") + self.assertEqual( + repr(alignment.sequences[0].seq), + "Seq({2: 'GGGCCCGGTTTAAAAAAA'}, length=20)", + ) + self.assertEqual( + repr(alignment.sequences[1].seq), + "Seq({2: 'TTTTTTTACCCGGGCCC'}, length=19)", + ) + self.assertTrue( + numpy.array_equal( + alignment.coordinates, + # fmt: off +# flake8: noqa + numpy.array([[ 2, 10, 12, 12, 20], + [19, 11, 11, 10, 2]]) + # fmt: on + ) + ) + self.assertEqual(alignment[0], "GGGCCCGGTT-TAAAAAAA") + self.assertEqual(alignment[1], "GGGCCCGG--GTAAAAAAA") + self.assertEqual( + alignment.column_annotations["emboss_consensus"], + "|||||||| ||||||||", + ) + with self.assertRaises(StopIteration): + next(alignments) + def test_pair_aln_full_blank_line(self): path = "Emboss/emboss_pair_aln_full_blank_line.txt" alignments = Align.parse(path, "emboss")