Support EMBOSS needle/water alignments to the reverse strand (#4142)

* update

* update

* update

* update

* update

* update

* add tests
This commit is contained in:
mdehoon
2022-10-24 01:24:31 +09:00
committed by GitHub
parent 23aacea3e0
commit b308ffe5f2
6 changed files with 432 additions and 15 deletions

View File

@ -11,7 +11,7 @@ for example from the needle, water, and stretcher tools.
""" """
from Bio.Align import Alignment from Bio.Align import Alignment
from Bio.Align import interfaces from Bio.Align import interfaces
from Bio.Seq import Seq from Bio.Seq import Seq, reverse_complement
from Bio.SeqRecord import SeqRecord from Bio.SeqRecord import SeqRecord
@ -99,6 +99,7 @@ class AlignmentIterator(interfaces.AlignmentIterator):
aligned_sequences = [""] * number_of_sequences aligned_sequences = [""] * number_of_sequences
consensus = "" consensus = ""
starts = [0] * number_of_sequences starts = [0] * number_of_sequences
ends = [0] * number_of_sequences
column = 0 column = 0
index = 0 index = 0
continue continue
@ -171,14 +172,19 @@ class AlignmentIterator(interfaces.AlignmentIterator):
n = len(sequences) n = len(sequences)
for i in range(n): for i in range(n):
start = starts[i] start = starts[i]
if start == 0: end = ends[i]
sequence = Seq(sequences[i]) if start < end:
else:
coordinates[i, :] += start coordinates[i, :] += start
data = sequences[i]
else:
start, end = end, start
coordinates[i, :] = end - coordinates[i, :]
data = reverse_complement(sequences[i])
if start == 0:
sequence = Seq(data)
else:
# create a partially defined sequence # create a partially defined sequence
length = start + len(sequences[i]) sequence = Seq({start: data}, length=end)
data = {start: sequences[i]}
sequence = Seq(data, length=length)
record = SeqRecord(sequence, identifiers[i]) record = SeqRecord(sequence, identifiers[i])
records.append(record) records.append(record)
alignment = Alignment(records, coordinates) alignment = Alignment(records, coordinates)
@ -198,21 +204,43 @@ class AlignmentIterator(interfaces.AlignmentIterator):
identifier, start = prefix.split(None, 1) identifier, start = prefix.split(None, 1)
assert identifiers[index].startswith(identifier) assert identifiers[index].startswith(identifier)
aligned_sequence, end = line[21:].split(None, 1) aligned_sequence, end = line[21:].split(None, 1)
start = int(start) - 1 # Python counting start = int(start)
end = int(end) end = int(end)
length = len(sequences[index]) length = len(sequences[index])
sequence = aligned_sequence.replace("-", "") sequence = aligned_sequence.replace("-", "")
if length == 0 and len(sequence) > 0: if length == 0 and len(sequence) > 0:
if start < end:
start -= 1 # Python counting
assert end == start + len(sequence)
else:
end -= 1 # Python counting
assert end == start - len(sequence)
# Record the start # Record the start
starts[index] = start starts[index] = start
else: else:
if ( if starts[index] <= ends[index]:
self.metadata["Align_format"] == "srspair" # forward strand
and len(sequence) == 0 if (
): self.metadata["Align_format"] == "srspair"
start += 1 and len(sequence) == 0
assert start == starts[index] + length ):
assert end == start + len(sequence) assert start == ends[index]
assert end == start
else:
start -= 1
assert end == start + len(sequence)
else:
if (
self.metadata["Align_format"] == "srspair"
and len(sequence) == 0
):
assert start - 1 == ends[index]
assert end == start
else:
end -= 1
assert end == start - len(sequence)
# Record the end
ends[index] = end
sequences[index] += sequence sequences[index] += sequence
aligned_sequences[index] += aligned_sequence aligned_sequences[index] += aligned_sequence
if index == 0: if index == 0:

View File

@ -0,0 +1,47 @@
########################################
# Program: water
# Rundate: Sat 22 Oct 2022 23:47:41
# Commandline: water
# -asequence seqA.fa
# -bsequence seqB.fa
# -gapopen 0.001
# -gapextend 0.001
# -sreverse1
# -outfile water_reverse1.txt
# Align_format: srspair
# Report_file: water_reverse1.txt
########################################
#=======================================
#
# Aligned_sequences: 2
# 1: seqA
# 2: seqB
# Matrix: EDNAFULL
# Gap_penalty: 0.001
# Extend_penalty: 0.001
#
# Length: 121
# Identity: 32/121 (26.4%)
# Similarity: 32/121 (26.4%)
# Gaps: 89/121 (73.6%)
# Score: 159.911
#
#
#=======================================
seqA 121 GGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT 72
|||||||||||||||||||
seqB 1 GGGGGGGGGGGGGGGGGGG------------------------------- 19
seqA 71 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT 22
seqB 19 -------------------------------------------------- 19
seqA 21 TTTTTTTTCCCCCCCCCCCCC 1
|||||||||||||
seqB 20 --------CCCCCCCCCCCCC 32
#---------------------------------------
#---------------------------------------

View File

@ -0,0 +1,47 @@
########################################
# Program: water
# Rundate: Sun 23 Oct 2022 00:06:18
# Commandline: water
# -asequence seqA.fa
# -bsequence seqB.fa
# -gapopen 0.001
# -gapextend 0.001
# -sreverse2
# -outfile water_reverse2.txt
# Align_format: srspair
# Report_file: water_reverse2.txt
########################################
#=======================================
#
# Aligned_sequences: 2
# 1: seqA
# 2: seqB
# Matrix: EDNAFULL
# Gap_penalty: 0.001
# Extend_penalty: 0.001
#
# Length: 121
# Identity: 32/121 (26.4%)
# Similarity: 32/121 (26.4%)
# Gaps: 89/121 (73.6%)
# Score: 159.911
#
#
#=======================================
seqA 1 GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 50
|||||||||||||
seqB 32 GGGGGGGGGGGGG------------------------------------- 20
seqA 51 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 100
seqB 20 -------------------------------------------------- 20
seqA 101 AACCCCCCCCCCCCCCCCCCC 121
|||||||||||||||||||
seqB 19 --CCCCCCCCCCCCCCCCCCC 1
#---------------------------------------
#---------------------------------------

View File

@ -0,0 +1,39 @@
########################################
# Program: water
# Rundate: Sat 22 Oct 2022 22:56:03
# Commandline: water
# -asequence seqA.fa
# -bsequence seqB.fa
# -gapopen 1
# -gapextend 0.5
# -sreverse1
# -outfile water_reverse3.txt
# Align_format: srspair
# Report_file: water_reverse3.txt
########################################
#=======================================
#
# Aligned_sequences: 2
# 1: seqA
# 2: seqB
# Matrix: EDNAFULL
# Gap_penalty: 1.0
# Extend_penalty: 0.5
#
# Length: 19
# Identity: 16/19 (84.2%)
# Similarity: 16/19 (84.2%)
# Gaps: 3/19 (15.8%)
# Score: 77.5
#
#
#=======================================
seqA 20 TTTTTTTAAA-CCGGGCCC 3
||||||| | ||||||||
seqB 3 TTTTTTT--ACCCGGGCCC 19
#---------------------------------------
#---------------------------------------

View File

@ -0,0 +1,39 @@
########################################
# Program: water
# Rundate: Sat 22 Oct 2022 22:56:15
# Commandline: water
# -asequence seqA.fa
# -bsequence seqB.fa
# -gapopen 1
# -gapextend 0.5
# -sreverse2
# -outfile water_reverse4.txt
# Align_format: srspair
# Report_file: water_reverse4.txt
########################################
#=======================================
#
# Aligned_sequences: 2
# 1: seqA
# 2: seqB
# Matrix: EDNAFULL
# Gap_penalty: 1.0
# Extend_penalty: 0.5
#
# Length: 19
# Identity: 16/19 (84.2%)
# Similarity: 16/19 (84.2%)
# Gaps: 3/19 (15.8%)
# Score: 77.5
#
#
#=======================================
seqA 3 GGGCCCGGTT-TAAAAAAA 20
|||||||| ||||||||
seqB 19 GGGCCCGG--GTAAAAAAA 3
#---------------------------------------
#---------------------------------------

View File

@ -1214,6 +1214,223 @@ numpy.array([['K', 'I', 'L', 'I', 'V', 'D', 'D', 'Q', 'Y', 'G', 'I', 'R', 'I',
with self.assertRaises(StopIteration): with self.assertRaises(StopIteration):
next(alignments) next(alignments)
def test_water_reverse1(self):
# water -asequence seqA.fa -bsequence seqB.fa -gapopen 10 -gapextend 0.5 -sreverse1 -outfile water_reverse1.txt
path = "Emboss/water_reverse1.txt"
alignments = Align.parse(path, "emboss")
self.assertEqual(alignments.metadata["Program"], "water")
self.assertEqual(alignments.metadata["Rundate"], "Sat 22 Oct 2022 23:47:41")
self.assertEqual(
alignments.metadata["Command line"],
"water -asequence seqA.fa -bsequence seqB.fa -gapopen 0.001 -gapextend 0.001 -sreverse1 -outfile water_reverse1.txt",
)
self.assertEqual(alignments.metadata["Align_format"], "srspair")
self.assertEqual(alignments.metadata["Report_file"], "water_reverse1.txt")
alignment = next(alignments)
self.assertEqual(alignment.annotations["Matrix"], "EDNAFULL")
self.assertAlmostEqual(alignment.annotations["Gap_penalty"], 0.001)
self.assertAlmostEqual(alignment.annotations["Extend_penalty"], 0.001)
self.assertEqual(alignment.annotations["Identity"], 32)
self.assertEqual(alignment.annotations["Similarity"], 32)
self.assertEqual(alignment.annotations["Gaps"], 89)
self.assertAlmostEqual(alignment.annotations["Score"], 159.911)
self.assertEqual(len(alignment), 2)
self.assertEqual(alignment.shape, (2, 121))
self.assertEqual(alignment.sequences[0].id, "seqA")
self.assertEqual(alignment.sequences[1].id, "seqB")
self.assertEqual(
alignment.sequences[0].seq,
"GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCC",
)
self.assertEqual(
alignment.sequences[1].seq,
"GGGGGGGGGGGGGGGGGGGCCCCCCCCCCCCC",
)
self.assertTrue(
numpy.array_equal(
alignment.coordinates,
# fmt: off
# flake8: noqa
numpy.array([[121, 102, 13, 0],
[ 0, 19, 19, 32]])
# fmt: on
)
)
self.assertEqual(
alignment[0],
"GGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCC",
)
self.assertEqual(
alignment[1],
"GGGGGGGGGGGGGGGGGGG-----------------------------------------------------------------------------------------CCCCCCCCCCCCC",
)
self.assertEqual(
alignment.column_annotations["emboss_consensus"],
"||||||||||||||||||| |||||||||||||",
)
with self.assertRaises(StopIteration):
next(alignments)
def test_water_reverse2(self):
# water -asequence seqA.fa -bsequence seqB.fa -gapopen 10 -gapextend 0.5 -sreverse2 -outfile water_reverse2.txt
path = "Emboss/water_reverse2.txt"
alignments = Align.parse(path, "emboss")
self.assertEqual(alignments.metadata["Program"], "water")
self.assertEqual(alignments.metadata["Rundate"], "Sun 23 Oct 2022 00:06:18")
self.assertEqual(
alignments.metadata["Command line"],
"water -asequence seqA.fa -bsequence seqB.fa -gapopen 0.001 -gapextend 0.001 -sreverse2 -outfile water_reverse2.txt",
)
self.assertEqual(alignments.metadata["Align_format"], "srspair")
self.assertEqual(alignments.metadata["Report_file"], "water_reverse2.txt")
alignment = next(alignments)
self.assertEqual(alignment.annotations["Matrix"], "EDNAFULL")
self.assertAlmostEqual(alignment.annotations["Gap_penalty"], 0.001)
self.assertAlmostEqual(alignment.annotations["Extend_penalty"], 0.001)
self.assertEqual(alignment.annotations["Identity"], 32)
self.assertEqual(alignment.annotations["Similarity"], 32)
self.assertEqual(alignment.annotations["Gaps"], 89)
self.assertAlmostEqual(alignment.annotations["Score"], 159.911)
self.assertEqual(len(alignment), 2)
self.assertEqual(alignment.shape, (2, 121))
self.assertEqual(alignment.sequences[0].id, "seqA")
self.assertEqual(alignment.sequences[1].id, "seqB")
self.assertEqual(
alignment.sequences[0].seq,
"GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCC",
)
self.assertEqual(
alignment.sequences[1].seq,
"GGGGGGGGGGGGGGGGGGGCCCCCCCCCCCCC",
)
self.assertTrue(
numpy.array_equal(
alignment.coordinates,
# fmt: off
numpy.array([[ 0, 13, 102, 121],
[32, 19, 19, 0]])
# fmt: on
)
)
self.assertEqual(
alignment[0],
"GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCC",
)
self.assertEqual(
alignment[1],
"GGGGGGGGGGGGG-----------------------------------------------------------------------------------------CCCCCCCCCCCCCCCCCCC",
)
self.assertEqual(
alignment.column_annotations["emboss_consensus"],
"||||||||||||| |||||||||||||||||||",
)
with self.assertRaises(StopIteration):
next(alignments)
def test_water_reverse3(self):
# water -asequence seqA.fa -bsequence seqB.fa -gapopen 10 -gapextend 0.5 -sreverse1 -outfile water_reverse3.txt
path = "Emboss/water_reverse3.txt"
alignments = Align.parse(path, "emboss")
self.assertEqual(alignments.metadata["Program"], "water")
self.assertEqual(alignments.metadata["Rundate"], "Sat 22 Oct 2022 22:56:03")
self.assertEqual(
alignments.metadata["Command line"],
"water -asequence seqA.fa -bsequence seqB.fa -gapopen 1 -gapextend 0.5 -sreverse1 -outfile water_reverse3.txt",
)
self.assertEqual(alignments.metadata["Align_format"], "srspair")
self.assertEqual(alignments.metadata["Report_file"], "water_reverse3.txt")
alignment = next(alignments)
self.assertEqual(alignment.annotations["Matrix"], "EDNAFULL")
self.assertAlmostEqual(alignment.annotations["Gap_penalty"], 1.0)
self.assertAlmostEqual(alignment.annotations["Extend_penalty"], 0.5)
self.assertEqual(alignment.annotations["Identity"], 16)
self.assertEqual(alignment.annotations["Similarity"], 16)
self.assertEqual(alignment.annotations["Gaps"], 3)
self.assertAlmostEqual(alignment.annotations["Score"], 77.5)
self.assertEqual(len(alignment), 2)
self.assertEqual(alignment.shape, (2, 19))
self.assertEqual(alignment.sequences[0].id, "seqA")
self.assertEqual(alignment.sequences[1].id, "seqB")
self.assertEqual(
repr(alignment.sequences[0].seq),
"Seq({2: 'GGGCCCGGTTTAAAAAAA'}, length=20)",
)
self.assertEqual(
repr(alignment.sequences[1].seq),
"Seq({2: 'TTTTTTTACCCGGGCCC'}, length=19)",
)
self.assertTrue(
numpy.array_equal(
alignment.coordinates,
# fmt: off
# flake8: noqa
numpy.array([[20, 13, 11, 10, 10, 2],
[ 2, 9, 9, 10, 11, 19]])
# fmt: on
)
)
self.assertEqual(alignment[0], "TTTTTTTAAA-CCGGGCCC")
self.assertEqual(alignment[1], "TTTTTTT--ACCCGGGCCC")
self.assertEqual(
alignment.column_annotations["emboss_consensus"],
"||||||| | ||||||||",
)
with self.assertRaises(StopIteration):
next(alignments)
def test_water_reverse4(self):
# water -asequence seqA.fa -bsequence seqB.fa -gapopen 10 -gapextend 0.5 -sreverse2 -outfile water_reverse4.txt
path = "Emboss/water_reverse4.txt"
alignments = Align.parse(path, "emboss")
self.assertEqual(alignments.metadata["Program"], "water")
self.assertEqual(alignments.metadata["Rundate"], "Sat 22 Oct 2022 22:56:15")
self.assertEqual(
alignments.metadata["Command line"],
"water -asequence seqA.fa -bsequence seqB.fa -gapopen 1 -gapextend 0.5 -sreverse2 -outfile water_reverse4.txt",
)
self.assertEqual(alignments.metadata["Align_format"], "srspair")
self.assertEqual(alignments.metadata["Report_file"], "water_reverse4.txt")
alignment = next(alignments)
self.assertEqual(alignment.annotations["Matrix"], "EDNAFULL")
self.assertAlmostEqual(alignment.annotations["Gap_penalty"], 1.0)
self.assertAlmostEqual(alignment.annotations["Extend_penalty"], 0.5)
self.assertEqual(alignment.annotations["Identity"], 16)
self.assertEqual(alignment.annotations["Similarity"], 16)
self.assertEqual(alignment.annotations["Gaps"], 3)
self.assertAlmostEqual(alignment.annotations["Score"], 77.5)
self.assertEqual(len(alignment), 2)
self.assertEqual(alignment.shape, (2, 19))
self.assertEqual(alignment.sequences[0].id, "seqA")
self.assertEqual(alignment.sequences[1].id, "seqB")
self.assertEqual(
repr(alignment.sequences[0].seq),
"Seq({2: 'GGGCCCGGTTTAAAAAAA'}, length=20)",
)
self.assertEqual(
repr(alignment.sequences[1].seq),
"Seq({2: 'TTTTTTTACCCGGGCCC'}, length=19)",
)
self.assertTrue(
numpy.array_equal(
alignment.coordinates,
# fmt: off
# flake8: noqa
numpy.array([[ 2, 10, 12, 12, 20],
[19, 11, 11, 10, 2]])
# fmt: on
)
)
self.assertEqual(alignment[0], "GGGCCCGGTT-TAAAAAAA")
self.assertEqual(alignment[1], "GGGCCCGG--GTAAAAAAA")
self.assertEqual(
alignment.column_annotations["emboss_consensus"],
"|||||||| ||||||||",
)
with self.assertRaises(StopIteration):
next(alignments)
def test_pair_aln_full_blank_line(self): def test_pair_aln_full_blank_line(self):
path = "Emboss/emboss_pair_aln_full_blank_line.txt" path = "Emboss/emboss_pair_aln_full_blank_line.txt"
alignments = Align.parse(path, "emboss") alignments = Align.parse(path, "emboss")