Support EMBOSS needle/water alignments to the reverse strand (#4142)

* update

* update

* update

* update

* update

* update

* add tests
This commit is contained in:
mdehoon
2022-10-24 01:24:31 +09:00
committed by GitHub
parent 23aacea3e0
commit b308ffe5f2
6 changed files with 432 additions and 15 deletions

View File

@ -11,7 +11,7 @@ for example from the needle, water, and stretcher tools.
"""
from Bio.Align import Alignment
from Bio.Align import interfaces
from Bio.Seq import Seq
from Bio.Seq import Seq, reverse_complement
from Bio.SeqRecord import SeqRecord
@ -99,6 +99,7 @@ class AlignmentIterator(interfaces.AlignmentIterator):
aligned_sequences = [""] * number_of_sequences
consensus = ""
starts = [0] * number_of_sequences
ends = [0] * number_of_sequences
column = 0
index = 0
continue
@ -171,14 +172,19 @@ class AlignmentIterator(interfaces.AlignmentIterator):
n = len(sequences)
for i in range(n):
start = starts[i]
if start == 0:
sequence = Seq(sequences[i])
else:
end = ends[i]
if start < end:
coordinates[i, :] += start
data = sequences[i]
else:
start, end = end, start
coordinates[i, :] = end - coordinates[i, :]
data = reverse_complement(sequences[i])
if start == 0:
sequence = Seq(data)
else:
# create a partially defined sequence
length = start + len(sequences[i])
data = {start: sequences[i]}
sequence = Seq(data, length=length)
sequence = Seq({start: data}, length=end)
record = SeqRecord(sequence, identifiers[i])
records.append(record)
alignment = Alignment(records, coordinates)
@ -198,21 +204,43 @@ class AlignmentIterator(interfaces.AlignmentIterator):
identifier, start = prefix.split(None, 1)
assert identifiers[index].startswith(identifier)
aligned_sequence, end = line[21:].split(None, 1)
start = int(start) - 1 # Python counting
start = int(start)
end = int(end)
length = len(sequences[index])
sequence = aligned_sequence.replace("-", "")
if length == 0 and len(sequence) > 0:
if start < end:
start -= 1 # Python counting
assert end == start + len(sequence)
else:
end -= 1 # Python counting
assert end == start - len(sequence)
# Record the start
starts[index] = start
else:
if starts[index] <= ends[index]:
# forward strand
if (
self.metadata["Align_format"] == "srspair"
and len(sequence) == 0
):
assert start == ends[index]
assert end == start
else:
start -= 1
assert end == start + len(sequence)
else:
if (
self.metadata["Align_format"] == "srspair"
and len(sequence) == 0
):
start += 1
assert start == starts[index] + length
assert end == start + len(sequence)
assert start - 1 == ends[index]
assert end == start
else:
end -= 1
assert end == start - len(sequence)
# Record the end
ends[index] = end
sequences[index] += sequence
aligned_sequences[index] += aligned_sequence
if index == 0:

View File

@ -0,0 +1,47 @@
########################################
# Program: water
# Rundate: Sat 22 Oct 2022 23:47:41
# Commandline: water
# -asequence seqA.fa
# -bsequence seqB.fa
# -gapopen 0.001
# -gapextend 0.001
# -sreverse1
# -outfile water_reverse1.txt
# Align_format: srspair
# Report_file: water_reverse1.txt
########################################
#=======================================
#
# Aligned_sequences: 2
# 1: seqA
# 2: seqB
# Matrix: EDNAFULL
# Gap_penalty: 0.001
# Extend_penalty: 0.001
#
# Length: 121
# Identity: 32/121 (26.4%)
# Similarity: 32/121 (26.4%)
# Gaps: 89/121 (73.6%)
# Score: 159.911
#
#
#=======================================
seqA 121 GGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT 72
|||||||||||||||||||
seqB 1 GGGGGGGGGGGGGGGGGGG------------------------------- 19
seqA 71 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT 22
seqB 19 -------------------------------------------------- 19
seqA 21 TTTTTTTTCCCCCCCCCCCCC 1
|||||||||||||
seqB 20 --------CCCCCCCCCCCCC 32
#---------------------------------------
#---------------------------------------

View File

@ -0,0 +1,47 @@
########################################
# Program: water
# Rundate: Sun 23 Oct 2022 00:06:18
# Commandline: water
# -asequence seqA.fa
# -bsequence seqB.fa
# -gapopen 0.001
# -gapextend 0.001
# -sreverse2
# -outfile water_reverse2.txt
# Align_format: srspair
# Report_file: water_reverse2.txt
########################################
#=======================================
#
# Aligned_sequences: 2
# 1: seqA
# 2: seqB
# Matrix: EDNAFULL
# Gap_penalty: 0.001
# Extend_penalty: 0.001
#
# Length: 121
# Identity: 32/121 (26.4%)
# Similarity: 32/121 (26.4%)
# Gaps: 89/121 (73.6%)
# Score: 159.911
#
#
#=======================================
seqA 1 GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 50
|||||||||||||
seqB 32 GGGGGGGGGGGGG------------------------------------- 20
seqA 51 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 100
seqB 20 -------------------------------------------------- 20
seqA 101 AACCCCCCCCCCCCCCCCCCC 121
|||||||||||||||||||
seqB 19 --CCCCCCCCCCCCCCCCCCC 1
#---------------------------------------
#---------------------------------------

View File

@ -0,0 +1,39 @@
########################################
# Program: water
# Rundate: Sat 22 Oct 2022 22:56:03
# Commandline: water
# -asequence seqA.fa
# -bsequence seqB.fa
# -gapopen 1
# -gapextend 0.5
# -sreverse1
# -outfile water_reverse3.txt
# Align_format: srspair
# Report_file: water_reverse3.txt
########################################
#=======================================
#
# Aligned_sequences: 2
# 1: seqA
# 2: seqB
# Matrix: EDNAFULL
# Gap_penalty: 1.0
# Extend_penalty: 0.5
#
# Length: 19
# Identity: 16/19 (84.2%)
# Similarity: 16/19 (84.2%)
# Gaps: 3/19 (15.8%)
# Score: 77.5
#
#
#=======================================
seqA 20 TTTTTTTAAA-CCGGGCCC 3
||||||| | ||||||||
seqB 3 TTTTTTT--ACCCGGGCCC 19
#---------------------------------------
#---------------------------------------

View File

@ -0,0 +1,39 @@
########################################
# Program: water
# Rundate: Sat 22 Oct 2022 22:56:15
# Commandline: water
# -asequence seqA.fa
# -bsequence seqB.fa
# -gapopen 1
# -gapextend 0.5
# -sreverse2
# -outfile water_reverse4.txt
# Align_format: srspair
# Report_file: water_reverse4.txt
########################################
#=======================================
#
# Aligned_sequences: 2
# 1: seqA
# 2: seqB
# Matrix: EDNAFULL
# Gap_penalty: 1.0
# Extend_penalty: 0.5
#
# Length: 19
# Identity: 16/19 (84.2%)
# Similarity: 16/19 (84.2%)
# Gaps: 3/19 (15.8%)
# Score: 77.5
#
#
#=======================================
seqA 3 GGGCCCGGTT-TAAAAAAA 20
|||||||| ||||||||
seqB 19 GGGCCCGG--GTAAAAAAA 3
#---------------------------------------
#---------------------------------------

View File

@ -1214,6 +1214,223 @@ numpy.array([['K', 'I', 'L', 'I', 'V', 'D', 'D', 'Q', 'Y', 'G', 'I', 'R', 'I',
with self.assertRaises(StopIteration):
next(alignments)
def test_water_reverse1(self):
# water -asequence seqA.fa -bsequence seqB.fa -gapopen 10 -gapextend 0.5 -sreverse1 -outfile water_reverse1.txt
path = "Emboss/water_reverse1.txt"
alignments = Align.parse(path, "emboss")
self.assertEqual(alignments.metadata["Program"], "water")
self.assertEqual(alignments.metadata["Rundate"], "Sat 22 Oct 2022 23:47:41")
self.assertEqual(
alignments.metadata["Command line"],
"water -asequence seqA.fa -bsequence seqB.fa -gapopen 0.001 -gapextend 0.001 -sreverse1 -outfile water_reverse1.txt",
)
self.assertEqual(alignments.metadata["Align_format"], "srspair")
self.assertEqual(alignments.metadata["Report_file"], "water_reverse1.txt")
alignment = next(alignments)
self.assertEqual(alignment.annotations["Matrix"], "EDNAFULL")
self.assertAlmostEqual(alignment.annotations["Gap_penalty"], 0.001)
self.assertAlmostEqual(alignment.annotations["Extend_penalty"], 0.001)
self.assertEqual(alignment.annotations["Identity"], 32)
self.assertEqual(alignment.annotations["Similarity"], 32)
self.assertEqual(alignment.annotations["Gaps"], 89)
self.assertAlmostEqual(alignment.annotations["Score"], 159.911)
self.assertEqual(len(alignment), 2)
self.assertEqual(alignment.shape, (2, 121))
self.assertEqual(alignment.sequences[0].id, "seqA")
self.assertEqual(alignment.sequences[1].id, "seqB")
self.assertEqual(
alignment.sequences[0].seq,
"GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCC",
)
self.assertEqual(
alignment.sequences[1].seq,
"GGGGGGGGGGGGGGGGGGGCCCCCCCCCCCCC",
)
self.assertTrue(
numpy.array_equal(
alignment.coordinates,
# fmt: off
# flake8: noqa
numpy.array([[121, 102, 13, 0],
[ 0, 19, 19, 32]])
# fmt: on
)
)
self.assertEqual(
alignment[0],
"GGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCC",
)
self.assertEqual(
alignment[1],
"GGGGGGGGGGGGGGGGGGG-----------------------------------------------------------------------------------------CCCCCCCCCCCCC",
)
self.assertEqual(
alignment.column_annotations["emboss_consensus"],
"||||||||||||||||||| |||||||||||||",
)
with self.assertRaises(StopIteration):
next(alignments)
def test_water_reverse2(self):
# water -asequence seqA.fa -bsequence seqB.fa -gapopen 10 -gapextend 0.5 -sreverse2 -outfile water_reverse2.txt
path = "Emboss/water_reverse2.txt"
alignments = Align.parse(path, "emboss")
self.assertEqual(alignments.metadata["Program"], "water")
self.assertEqual(alignments.metadata["Rundate"], "Sun 23 Oct 2022 00:06:18")
self.assertEqual(
alignments.metadata["Command line"],
"water -asequence seqA.fa -bsequence seqB.fa -gapopen 0.001 -gapextend 0.001 -sreverse2 -outfile water_reverse2.txt",
)
self.assertEqual(alignments.metadata["Align_format"], "srspair")
self.assertEqual(alignments.metadata["Report_file"], "water_reverse2.txt")
alignment = next(alignments)
self.assertEqual(alignment.annotations["Matrix"], "EDNAFULL")
self.assertAlmostEqual(alignment.annotations["Gap_penalty"], 0.001)
self.assertAlmostEqual(alignment.annotations["Extend_penalty"], 0.001)
self.assertEqual(alignment.annotations["Identity"], 32)
self.assertEqual(alignment.annotations["Similarity"], 32)
self.assertEqual(alignment.annotations["Gaps"], 89)
self.assertAlmostEqual(alignment.annotations["Score"], 159.911)
self.assertEqual(len(alignment), 2)
self.assertEqual(alignment.shape, (2, 121))
self.assertEqual(alignment.sequences[0].id, "seqA")
self.assertEqual(alignment.sequences[1].id, "seqB")
self.assertEqual(
alignment.sequences[0].seq,
"GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCC",
)
self.assertEqual(
alignment.sequences[1].seq,
"GGGGGGGGGGGGGGGGGGGCCCCCCCCCCCCC",
)
self.assertTrue(
numpy.array_equal(
alignment.coordinates,
# fmt: off
numpy.array([[ 0, 13, 102, 121],
[32, 19, 19, 0]])
# fmt: on
)
)
self.assertEqual(
alignment[0],
"GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCC",
)
self.assertEqual(
alignment[1],
"GGGGGGGGGGGGG-----------------------------------------------------------------------------------------CCCCCCCCCCCCCCCCCCC",
)
self.assertEqual(
alignment.column_annotations["emboss_consensus"],
"||||||||||||| |||||||||||||||||||",
)
with self.assertRaises(StopIteration):
next(alignments)
def test_water_reverse3(self):
# water -asequence seqA.fa -bsequence seqB.fa -gapopen 10 -gapextend 0.5 -sreverse1 -outfile water_reverse3.txt
path = "Emboss/water_reverse3.txt"
alignments = Align.parse(path, "emboss")
self.assertEqual(alignments.metadata["Program"], "water")
self.assertEqual(alignments.metadata["Rundate"], "Sat 22 Oct 2022 22:56:03")
self.assertEqual(
alignments.metadata["Command line"],
"water -asequence seqA.fa -bsequence seqB.fa -gapopen 1 -gapextend 0.5 -sreverse1 -outfile water_reverse3.txt",
)
self.assertEqual(alignments.metadata["Align_format"], "srspair")
self.assertEqual(alignments.metadata["Report_file"], "water_reverse3.txt")
alignment = next(alignments)
self.assertEqual(alignment.annotations["Matrix"], "EDNAFULL")
self.assertAlmostEqual(alignment.annotations["Gap_penalty"], 1.0)
self.assertAlmostEqual(alignment.annotations["Extend_penalty"], 0.5)
self.assertEqual(alignment.annotations["Identity"], 16)
self.assertEqual(alignment.annotations["Similarity"], 16)
self.assertEqual(alignment.annotations["Gaps"], 3)
self.assertAlmostEqual(alignment.annotations["Score"], 77.5)
self.assertEqual(len(alignment), 2)
self.assertEqual(alignment.shape, (2, 19))
self.assertEqual(alignment.sequences[0].id, "seqA")
self.assertEqual(alignment.sequences[1].id, "seqB")
self.assertEqual(
repr(alignment.sequences[0].seq),
"Seq({2: 'GGGCCCGGTTTAAAAAAA'}, length=20)",
)
self.assertEqual(
repr(alignment.sequences[1].seq),
"Seq({2: 'TTTTTTTACCCGGGCCC'}, length=19)",
)
self.assertTrue(
numpy.array_equal(
alignment.coordinates,
# fmt: off
# flake8: noqa
numpy.array([[20, 13, 11, 10, 10, 2],
[ 2, 9, 9, 10, 11, 19]])
# fmt: on
)
)
self.assertEqual(alignment[0], "TTTTTTTAAA-CCGGGCCC")
self.assertEqual(alignment[1], "TTTTTTT--ACCCGGGCCC")
self.assertEqual(
alignment.column_annotations["emboss_consensus"],
"||||||| | ||||||||",
)
with self.assertRaises(StopIteration):
next(alignments)
def test_water_reverse4(self):
# water -asequence seqA.fa -bsequence seqB.fa -gapopen 10 -gapextend 0.5 -sreverse2 -outfile water_reverse4.txt
path = "Emboss/water_reverse4.txt"
alignments = Align.parse(path, "emboss")
self.assertEqual(alignments.metadata["Program"], "water")
self.assertEqual(alignments.metadata["Rundate"], "Sat 22 Oct 2022 22:56:15")
self.assertEqual(
alignments.metadata["Command line"],
"water -asequence seqA.fa -bsequence seqB.fa -gapopen 1 -gapextend 0.5 -sreverse2 -outfile water_reverse4.txt",
)
self.assertEqual(alignments.metadata["Align_format"], "srspair")
self.assertEqual(alignments.metadata["Report_file"], "water_reverse4.txt")
alignment = next(alignments)
self.assertEqual(alignment.annotations["Matrix"], "EDNAFULL")
self.assertAlmostEqual(alignment.annotations["Gap_penalty"], 1.0)
self.assertAlmostEqual(alignment.annotations["Extend_penalty"], 0.5)
self.assertEqual(alignment.annotations["Identity"], 16)
self.assertEqual(alignment.annotations["Similarity"], 16)
self.assertEqual(alignment.annotations["Gaps"], 3)
self.assertAlmostEqual(alignment.annotations["Score"], 77.5)
self.assertEqual(len(alignment), 2)
self.assertEqual(alignment.shape, (2, 19))
self.assertEqual(alignment.sequences[0].id, "seqA")
self.assertEqual(alignment.sequences[1].id, "seqB")
self.assertEqual(
repr(alignment.sequences[0].seq),
"Seq({2: 'GGGCCCGGTTTAAAAAAA'}, length=20)",
)
self.assertEqual(
repr(alignment.sequences[1].seq),
"Seq({2: 'TTTTTTTACCCGGGCCC'}, length=19)",
)
self.assertTrue(
numpy.array_equal(
alignment.coordinates,
# fmt: off
# flake8: noqa
numpy.array([[ 2, 10, 12, 12, 20],
[19, 11, 11, 10, 2]])
# fmt: on
)
)
self.assertEqual(alignment[0], "GGGCCCGGTT-TAAAAAAA")
self.assertEqual(alignment[1], "GGGCCCGG--GTAAAAAAA")
self.assertEqual(
alignment.column_annotations["emboss_consensus"],
"|||||||| ||||||||",
)
with self.assertRaises(StopIteration):
next(alignments)
def test_pair_aln_full_blank_line(self):
path = "Emboss/emboss_pair_aln_full_blank_line.txt"
alignments = Align.parse(path, "emboss")