mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 21:53:47 +08:00
$ ruff check --fix --select=I \ --config=lint.isort.force-single-line=true \ --config=lint.isort.order-by-type=false \ BioSQL/ Bio/ Tests/ Scripts/ Doc/ setup.py Using ruff version 0.4.10
999 lines
41 KiB
Python
999 lines
41 KiB
Python
# This code is part of the Biopython distribution and governed by its
|
|
# license. Please see the LICENSE file that should have been included
|
|
# as part of this package.
|
|
|
|
|
|
"""Tests for mapping pairwise alignments."""
|
|
|
|
import os
|
|
import random
|
|
import unittest
|
|
|
|
try:
|
|
import numpy as np
|
|
except ImportError:
|
|
from Bio import MissingPythonDependencyError
|
|
|
|
raise MissingPythonDependencyError(
|
|
"Install numpy if you want to use Bio.Align.Alignment.map."
|
|
) from None
|
|
|
|
from Bio import Align
|
|
from Bio import SeqIO
|
|
from Bio.Align import Alignment
|
|
from Bio.Align import PairwiseAligner
|
|
from Bio.Seq import Seq
|
|
from Bio.SeqRecord import SeqRecord
|
|
|
|
|
|
class TestSimple(unittest.TestCase):
|
|
aligner = PairwiseAligner()
|
|
aligner.internal_open_gap_score = -1
|
|
aligner.internal_extend_gap_score = -0.0
|
|
aligner.match_score = +1
|
|
aligner.mismatch_score = -1
|
|
aligner.mode = "local"
|
|
|
|
def test_internal(self):
|
|
aligner = self.aligner
|
|
chromosome = Seq("AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA")
|
|
chromosome.id = "chromosome"
|
|
transcript = Seq("GGGGGGGCCCCCGGGGGGA")
|
|
transcript.id = "transcript"
|
|
sequence = Seq("GGCCCCCGGG")
|
|
sequence.id = "sequence"
|
|
alignments1 = aligner.align(chromosome, transcript)
|
|
self.assertEqual(len(alignments1), 1)
|
|
alignment1 = alignments1[0]
|
|
self.assertTrue(
|
|
np.array_equal(alignment1.coordinates, np.array([[12, 31], [0, 19]]))
|
|
)
|
|
self.assertEqual(
|
|
str(alignment1),
|
|
"""\
|
|
chromosom 12 GGGGGGGCCCCCGGGGGGA 31
|
|
0 ||||||||||||||||||| 19
|
|
transcrip 0 GGGGGGGCCCCCGGGGGGA 19
|
|
""",
|
|
)
|
|
alignments2 = aligner.align(transcript, sequence)
|
|
self.assertEqual(len(alignments2), 1)
|
|
alignment2 = alignments2[0]
|
|
self.assertTrue(
|
|
np.array_equal(alignment2.coordinates, np.array([[5, 15], [0, 10]]))
|
|
)
|
|
self.assertEqual(
|
|
str(alignment2),
|
|
"""\
|
|
transcrip 5 GGCCCCCGGG 15
|
|
0 |||||||||| 10
|
|
sequence 0 GGCCCCCGGG 10
|
|
""",
|
|
)
|
|
alignment = alignment1.map(alignment2)
|
|
self.assertTrue(
|
|
np.array_equal(alignment.coordinates, np.array([[17, 27], [0, 10]]))
|
|
)
|
|
self.assertEqual(
|
|
str(alignment),
|
|
"""\
|
|
chromosom 17 GGCCCCCGGG 27
|
|
0 |||||||||| 10
|
|
sequence 0 GGCCCCCGGG 10
|
|
""",
|
|
)
|
|
line = format(alignment, "psl")
|
|
self.assertEqual(
|
|
line,
|
|
"""\
|
|
10 0 0 0 0 0 0 0 + sequence 10 0 10 chromosome 40 17 27 1 10, 0, 17,
|
|
""",
|
|
)
|
|
|
|
def test_left_overhang(self):
|
|
aligner = self.aligner
|
|
chromosome = Seq("GGGCCCCCGGGGGGAAAAAAAAAA")
|
|
chromosome.id = "chromosome"
|
|
transcript = Seq("AGGGGGCCCCCGGGGGGA")
|
|
transcript.id = "transcript"
|
|
sequence = Seq("GGGGGCCCCCGGG")
|
|
sequence.id = "sequence"
|
|
alignments1 = aligner.align(chromosome, transcript)
|
|
self.assertEqual(len(alignments1), 1)
|
|
alignment1 = alignments1[0]
|
|
self.assertEqual(
|
|
str(alignment1),
|
|
"""\
|
|
chromosom 0 GGGCCCCCGGGGGGA 15
|
|
0 ||||||||||||||| 15
|
|
transcrip 3 GGGCCCCCGGGGGGA 18
|
|
""",
|
|
)
|
|
alignments2 = aligner.align(transcript, sequence)
|
|
self.assertEqual(len(alignments2), 1)
|
|
alignment2 = alignments2[0]
|
|
self.assertEqual(
|
|
str(alignment2),
|
|
"""\
|
|
transcrip 1 GGGGGCCCCCGGG 14
|
|
0 ||||||||||||| 13
|
|
sequence 0 GGGGGCCCCCGGG 13
|
|
""",
|
|
)
|
|
alignment = alignment1.map(alignment2)
|
|
self.assertTrue(
|
|
np.array_equal(alignment.coordinates, np.array([[0, 11], [2, 13]]))
|
|
)
|
|
self.assertEqual(
|
|
str(alignment),
|
|
"""\
|
|
chromosom 0 GGGCCCCCGGG 11
|
|
0 ||||||||||| 11
|
|
sequence 2 GGGCCCCCGGG 13
|
|
""",
|
|
)
|
|
line = format(alignment, "psl")
|
|
self.assertEqual(
|
|
line,
|
|
"""\
|
|
11 0 0 0 0 0 0 0 + sequence 13 2 13 chromosome 24 0 11 1 11, 2, 0,
|
|
""",
|
|
)
|
|
|
|
def test_right_overhang(self):
|
|
aligner = self.aligner
|
|
chromosome = Seq("AAAAAAAAAAAAGGGGGGGCCCCCGGG")
|
|
chromosome.id = "chromosome"
|
|
transcript = Seq("GGGGGGGCCCCCGGGGGGA")
|
|
transcript.id = "transcript"
|
|
sequence = Seq("GGCCCCCGGGGG")
|
|
sequence.id = "sequence"
|
|
alignments1 = aligner.align(chromosome, transcript)
|
|
self.assertEqual(len(alignments1), 1)
|
|
alignment1 = alignments1[0]
|
|
self.assertEqual(
|
|
str(alignment1),
|
|
"""\
|
|
chromosom 12 GGGGGGGCCCCCGGG 27
|
|
0 ||||||||||||||| 15
|
|
transcrip 0 GGGGGGGCCCCCGGG 15
|
|
""",
|
|
)
|
|
alignments2 = aligner.align(transcript, sequence)
|
|
self.assertEqual(len(alignments2), 1)
|
|
alignment2 = alignments2[0]
|
|
self.assertEqual(
|
|
str(alignment2),
|
|
"""\
|
|
transcrip 5 GGCCCCCGGGGG 17
|
|
0 |||||||||||| 12
|
|
sequence 0 GGCCCCCGGGGG 12
|
|
""",
|
|
)
|
|
alignment = alignment1.map(alignment2)
|
|
self.assertTrue(
|
|
np.array_equal(alignment.coordinates, np.array([[17, 27], [0, 10]]))
|
|
)
|
|
self.assertEqual(
|
|
str(alignment),
|
|
"""\
|
|
chromosom 17 GGCCCCCGGG 27
|
|
0 |||||||||| 10
|
|
sequence 0 GGCCCCCGGG 10
|
|
""",
|
|
)
|
|
line = format(alignment, "psl")
|
|
self.assertEqual(
|
|
line,
|
|
"""\
|
|
10 0 0 0 0 0 0 0 + sequence 12 0 10 chromosome 27 17 27 1 10, 0, 17,
|
|
""",
|
|
)
|
|
|
|
def test_reverse_transcript(self):
|
|
aligner = self.aligner
|
|
chromosome = Seq("AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA")
|
|
chromosome.id = "chromosome"
|
|
transcript = Seq("TCCCCCCGGGGGCCCCCCC")
|
|
transcript.id = "transcript"
|
|
sequence = Seq("GGCCCCCGGG")
|
|
sequence.id = "sequence"
|
|
alignments1 = aligner.align(chromosome, transcript, strand="-")
|
|
self.assertEqual(len(alignments1), 1)
|
|
alignment1 = alignments1[0]
|
|
self.assertTrue(
|
|
np.array_equal(alignment1.coordinates, np.array([[12, 31], [19, 0]]))
|
|
)
|
|
self.assertEqual(
|
|
str(alignment1),
|
|
"""\
|
|
chromosom 12 GGGGGGGCCCCCGGGGGGA 31
|
|
0 ||||||||||||||||||| 19
|
|
transcrip 19 GGGGGGGCCCCCGGGGGGA 0
|
|
""",
|
|
)
|
|
alignments2 = aligner.align(transcript, sequence, strand="-")
|
|
self.assertEqual(len(alignments2), 1)
|
|
alignment2 = alignments2[0]
|
|
self.assertTrue(
|
|
np.array_equal(alignment2.coordinates, np.array([[4, 14], [10, 0]]))
|
|
)
|
|
self.assertEqual(
|
|
str(alignment2),
|
|
"""\
|
|
transcrip 4 CCCGGGGGCC 14
|
|
0 |||||||||| 10
|
|
sequence 10 CCCGGGGGCC 0
|
|
""",
|
|
)
|
|
alignment = alignment1.map(alignment2)
|
|
self.assertTrue(
|
|
np.array_equal(alignment.coordinates, np.array([[17, 27], [0, 10]]))
|
|
)
|
|
self.assertEqual(
|
|
str(alignment),
|
|
"""\
|
|
chromosom 17 GGCCCCCGGG 27
|
|
0 |||||||||| 10
|
|
sequence 0 GGCCCCCGGG 10
|
|
""",
|
|
)
|
|
line = format(alignment, "psl")
|
|
self.assertEqual(
|
|
line,
|
|
"""\
|
|
10 0 0 0 0 0 0 0 + sequence 10 0 10 chromosome 40 17 27 1 10, 0, 17,
|
|
""",
|
|
)
|
|
|
|
def test_reverse_sequence(self):
|
|
aligner = self.aligner
|
|
chromosome = Seq("AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA")
|
|
chromosome.id = "chromosome"
|
|
transcript = Seq("GGGGGGGCCCCCGGGGGGA")
|
|
transcript.id = "transcript"
|
|
sequence = Seq("CCCGGGGGCC")
|
|
sequence.id = "sequence"
|
|
alignments1 = aligner.align(chromosome, transcript)
|
|
self.assertEqual(len(alignments1), 1)
|
|
alignment1 = alignments1[0]
|
|
self.assertTrue(
|
|
np.array_equal(alignment1.coordinates, np.array([[12, 31], [0, 19]]))
|
|
)
|
|
self.assertEqual(
|
|
str(alignment1),
|
|
"""\
|
|
chromosom 12 GGGGGGGCCCCCGGGGGGA 31
|
|
0 ||||||||||||||||||| 19
|
|
transcrip 0 GGGGGGGCCCCCGGGGGGA 19
|
|
""",
|
|
)
|
|
alignments2 = aligner.align(transcript, sequence, "-")
|
|
self.assertEqual(len(alignments2), 1)
|
|
alignment2 = alignments2[0]
|
|
self.assertTrue(
|
|
np.array_equal(alignment2.coordinates, np.array([[5, 15], [10, 0]]))
|
|
)
|
|
self.assertEqual(
|
|
str(alignment2),
|
|
"""\
|
|
transcrip 5 GGCCCCCGGG 15
|
|
0 |||||||||| 10
|
|
sequence 10 GGCCCCCGGG 0
|
|
""",
|
|
)
|
|
alignment = alignment1.map(alignment2)
|
|
self.assertTrue(
|
|
np.array_equal(alignment.coordinates, np.array([[17, 27], [10, 0]]))
|
|
)
|
|
self.assertEqual(
|
|
str(alignment),
|
|
"""\
|
|
chromosom 17 GGCCCCCGGG 27
|
|
0 |||||||||| 10
|
|
sequence 10 GGCCCCCGGG 0
|
|
""",
|
|
)
|
|
line = format(alignment, "psl")
|
|
self.assertEqual(
|
|
line,
|
|
"""\
|
|
10 0 0 0 0 0 0 0 - sequence 10 0 10 chromosome 40 17 27 1 10, 0, 17,
|
|
""",
|
|
)
|
|
|
|
def test_reverse_transcript_sequence(self):
|
|
aligner = self.aligner
|
|
chromosome = Seq("AAAAAAAAAAAAGGGGGGGCCCCCGGGGGGAAAAAAAAAA")
|
|
chromosome.id = "chromosome"
|
|
transcript = Seq("TCCCCCCGGGGGCCCCCCC")
|
|
transcript.id = "transcript"
|
|
sequence = Seq("CCCGGGGGCC")
|
|
sequence.id = "sequence"
|
|
alignments1 = aligner.align(chromosome, transcript, "-")
|
|
self.assertEqual(len(alignments1), 1)
|
|
alignment1 = alignments1[0]
|
|
self.assertTrue(
|
|
np.array_equal(alignment1.coordinates, np.array([[12, 31], [19, 0]]))
|
|
)
|
|
self.assertEqual(
|
|
str(alignment1),
|
|
"""\
|
|
chromosom 12 GGGGGGGCCCCCGGGGGGA 31
|
|
0 ||||||||||||||||||| 19
|
|
transcrip 19 GGGGGGGCCCCCGGGGGGA 0
|
|
""",
|
|
)
|
|
alignments2 = aligner.align(transcript, sequence)
|
|
self.assertEqual(len(alignments2), 1)
|
|
alignment2 = alignments2[0]
|
|
self.assertTrue(
|
|
np.array_equal(alignment2.coordinates, np.array([[4, 14], [0, 10]]))
|
|
)
|
|
self.assertEqual(
|
|
str(alignment2),
|
|
"""\
|
|
transcrip 4 CCCGGGGGCC 14
|
|
0 |||||||||| 10
|
|
sequence 0 CCCGGGGGCC 10
|
|
""",
|
|
)
|
|
alignment = alignment1.map(alignment2)
|
|
self.assertTrue(
|
|
np.array_equal(alignment.coordinates, np.array([[17, 27], [10, 0]]))
|
|
)
|
|
self.assertEqual(
|
|
str(alignment),
|
|
"""\
|
|
chromosom 17 GGCCCCCGGG 27
|
|
0 |||||||||| 10
|
|
sequence 10 GGCCCCCGGG 0
|
|
""",
|
|
)
|
|
line = format(alignment, "psl")
|
|
self.assertEqual(
|
|
line,
|
|
"""\
|
|
10 0 0 0 0 0 0 0 - sequence 10 0 10 chromosome 40 17 27 1 10, 0, 17,
|
|
""",
|
|
)
|
|
|
|
|
|
class TestComplex(unittest.TestCase):
|
|
aligner = PairwiseAligner()
|
|
aligner.internal_open_gap_score = -1
|
|
aligner.internal_extend_gap_score = -0.0
|
|
aligner.match_score = +1
|
|
aligner.mismatch_score = -1
|
|
aligner.mode = "local"
|
|
|
|
def test1(self):
|
|
aligner = self.aligner
|
|
chromosome = Seq(
|
|
"GCCTACCGTATAACAATGGTTATAATACAAGGCGGTCATAATTAAAGGGAGTGCAGCAACGGCCTGCTCTCCAAAAAAACAGGTTTTATGAAAAGAAAGTGCATTAACTGTTAAAGCCGTCATATCGGTGGGTTCTGCCAGTCACCGGCATACGTCCTGGGACAAAGACTTTTTACTACAATGCCAGGCGGGAGAGTCACCCGCCGCGGTGTCGACCCAGGGGACAGCGGGAAGATGTCGTGGTTTCCTTGTCATTAACCAACTCCATCTTAAAAGCTCCTCTAGCCATGGCATGGTACGTTGCGCGCACCCTTTTATCGGTAAGGCGCGGTGACTCTCTCCCAAAACAGTGCCATAATGGTTCGCTTCCTACCTAAGGCACTTACGGCCAATTAATGCGCAAGCGAGCGGAAGGTCTAACAGGGCACCGAATTCGATTA"
|
|
)
|
|
chromosome.id = "chromosome"
|
|
transcript = Seq(
|
|
"GGAATTTTAGCAGCCAAAGGACGGATCCTCCAAGGGGCCCCAGCACAGCACATTTTTAACGCGAACTAAGCGGGAGCGCATGTGGGACAGTTGATCCCATCCGCCTCAAAATTTCTCGCAATATCGGTTGGGGCACAGGTCCACTTTACGAATTCATACCGTGGTAGAGACCTTTATTAGATAGATATGACTGTTTGATTGCGGCATAGTACGACGAAGCAAGGGGATGGACGTTTCGGTTGCATTCGACCGGGTTGGGTCGAAAAACAGGTTTTATGAAAAGAAAGTGCATTAACTGTTAAAGCCGTCATATCGGTGGGTTC"
|
|
)
|
|
transcript.id = "transcript"
|
|
sequence = Seq(
|
|
"TCCAAGGGGCCCCAGCACAGCACATTTTTAACGCGGGGACAGTTGATCCCATCCGCCTTTTACGAATTCATACCGTGGTAGGCGGCATAGTACGACGAAGCGGTTGGGTCGAAAAACAGGTTGCCGTCATATCGGTGGGTTC"
|
|
)
|
|
sequence.id = "sequence"
|
|
alignments1 = aligner.align(chromosome, transcript)
|
|
alignment1 = alignments1[0]
|
|
self.assertEqual(alignment1.coordinates.shape[1], 164)
|
|
self.assertEqual(
|
|
str(alignment1),
|
|
"""\
|
|
chromosom 14 AATGGTTATA------ATACAAGG-CGG----TCATAATTAAAGGGAGTG---CAGCAAC
|
|
0 |||--||-||------|---||||-|||----||------.|||||---|---|||||--
|
|
transcrip 2 AAT--TT-TAGCAGCCA---AAGGACGGATCCTC------CAAGGG---GCCCCAGCA--
|
|
|
|
chromosom 60 GGCCTGCTCTCCAAAAAAACAGGTTTTATGAAAAGAAAGTGCATTAACTGTTAAAGC---
|
|
60 ---|.||-----------|||--||||-|------||.|.|----||||----||||---
|
|
transcrip 45 ---CAGC-----------ACA--TTTT-T------AACGCG----AACT----AAGCGGG
|
|
|
|
chromosom 117 --CGTCATATCGGTGG----GTTCTGCCAGTCACCGGCATACGTCCTGGGACAAAGACTT
|
|
120 --||-|||----||||----||--||--|-|--||--|||.||-|||----||||-|---
|
|
transcrip 74 AGCG-CAT----GTGGGACAGT--TG--A-T--CC--CATCCG-CCT----CAAA-A---
|
|
|
|
chromosom 171 TTTACT-ACAATGCCAGGCGGGAGAGTCACCCGCCGCGGTGTCGACCCAGGGG-ACAGCG
|
|
180 |||-||-.||||------------|-|---------||||-|-------||||-||||--
|
|
transcrip 111 TTT-CTCGCAAT------------A-T---------CGGT-T-------GGGGCACAG--
|
|
|
|
chromosom 229 GGAAGATGTCGTGGTTTC-CTT---G---TCATTAACC-------A-ACTCCATCTTA--
|
|
240 -------|||-------|-|||---|---||||--|||-------|-|--||-|-|||--
|
|
transcrip 138 -------GTC-------CACTTTACGAATTCAT--ACCGTGGTAGAGA--CC-T-TTATT
|
|
|
|
chromosom 272 AAAGCTCCTCTAGCCATGGCATG---GT---ACGTTGCGCGCACCCTTTTA-T----CG-
|
|
300 |.|-------|||--||---|||---||---|--|||||-|||------||-|----||-
|
|
transcrip 178 AGA-------TAG--AT---ATGACTGTTTGA--TTGCG-GCA------TAGTACGACGA
|
|
|
|
chromosom 320 -GTAAGG-------CG---CGGT-------GACTCTC--------TCCCAAAACAGTGCC
|
|
360 -|.||||-------||---||||-------|||---|--------||..||||||-----
|
|
transcrip 217 AGCAAGGGGATGGACGTTTCGGTTGCATTCGAC---CGGGTTGGGTCGAAAAACA-----
|
|
|
|
chromosom 354 ATAATGGTTCGCTTCCTACCT-------AAG-GCACTT-ACGGCCAATTAATGCGCAAGC
|
|
420 -----|||----||--||--|-------|||-|||-||-||.|----|||------||||
|
|
transcrip 269 -----GGT----TT--TA--TGAAAAGAAAGTGCA-TTAACTG----TTA------AAGC
|
|
|
|
chromosom 405 GAGCGGAAGGTC-TAACAG-GGCACCGAATTC 435
|
|
480 ---|-----|||-||.|.|-||----|--||| 512
|
|
transcrip 305 ---C-----GTCATATCGGTGG----G--TTC 323
|
|
""",
|
|
)
|
|
alignments2 = aligner.align(transcript, sequence)
|
|
alignment2 = alignments2[0]
|
|
self.assertEqual(alignment2.coordinates.shape[1], 12)
|
|
self.assertEqual(
|
|
str(alignment2),
|
|
"""\
|
|
transcrip 28 TCCAAGGGGCCCCAGCACAGCACATTTTTAACGCGAACTAAGCGGGAGCGCATGTGGGAC
|
|
0 |||||||||||||||||||||||||||||||||||--------------------|||||
|
|
sequence 0 TCCAAGGGGCCCCAGCACAGCACATTTTTAACGCG--------------------GGGAC
|
|
|
|
transcrip 88 AGTTGATCCCATCCGCCTCAAAATTTCTCGCAATATCGGTTGGGGCACAGGTCCACTTTA
|
|
60 ||||||||||||||||||--------------------------------------||||
|
|
sequence 40 AGTTGATCCCATCCGCCT--------------------------------------TTTA
|
|
|
|
transcrip 148 CGAATTCATACCGTGGTAGAGACCTTTATTAGATAGATATGACTGTTTGATTGCGGCATA
|
|
120 |||||||||||||||||||---------------------------------||||||||
|
|
sequence 62 CGAATTCATACCGTGGTAG---------------------------------GCGGCATA
|
|
|
|
transcrip 208 GTACGACGAAGCAAGGGGATGGACGTTTCGGTTGCATTCGACCGGGTTGGGTCGAAAAAC
|
|
180 ||||||||||||--------------------------------||||||||||||||||
|
|
sequence 89 GTACGACGAAGC--------------------------------GGTTGGGTCGAAAAAC
|
|
|
|
transcrip 268 AGGTTTTATGAAAAGAAAGTGCATTAACTGTTAAAGCCGTCATATCGGTGGGTTC 323
|
|
240 |||||------------------------------|||||||||||||||||||| 295
|
|
sequence 117 AGGTT------------------------------GCCGTCATATCGGTGGGTTC 142
|
|
""",
|
|
)
|
|
alignment = alignment1.map(alignment2)
|
|
self.assertEqual(alignment.coordinates.shape[1], 76)
|
|
self.assertEqual(
|
|
str(alignment),
|
|
"""\
|
|
chromosom 35 TCATAATTAAAGGGAGTG---CAGCAACGGCCTGCTCTCCAAAAAAACAGGTTTTATGAA
|
|
0 ||------.|||||---|---|||||-----|.||-----------|||--||||-|---
|
|
sequence 0 TC------CAAGGG---GCCCCAGCA-----CAGC-----------ACA--TTTT-T---
|
|
|
|
chromosom 92 AAGAAAGTGCATTAACTGTTAAAGCCGTCATATCGGTGG----GTTCTGCCAGTCACCGG
|
|
60 ---||.|.|----------------------------||----||--||--|-|--||--
|
|
sequence 29 ---AACGCG----------------------------GGGACAGT--TG--A-T--CC--
|
|
|
|
chromosom 148 CATACGTCCTGGGACAAAGACTTTTTACTACAATGCCAGGCGGGAGAGTCACCCGCCGCG
|
|
120 |||.||-|||--------------------------------------------------
|
|
sequence 49 CATCCG-CCT--------------------------------------------------
|
|
|
|
chromosom 208 GTGTCGACCCAGGGGACAGCGGGAAGATGTCGTGGTTTCCTT---G---TCATTAACCAA
|
|
180 ----------------------------------------||---|---||||--|||--
|
|
sequence 58 ----------------------------------------TTTACGAATTCAT--ACC--
|
|
|
|
chromosom 262 CTCCATCTTAAAAGCTCCTCTAGCCATGGCATGGTACGTT-------GCGCGCACCCTTT
|
|
240 -----------------------------------------------|||-|||------
|
|
sequence 74 ----------------------------------------GTGGTAGGCG-GCA------
|
|
|
|
chromosom 315 TA-T----CG--GTAAGGCGCGGTGACTCTC-------TCCCAAAACAGTGCCATAATGG
|
|
300 ||-|----||--|.------------------------||..||||||----------||
|
|
sequence 87 TAGTACGACGAAGC-----------------GGTTGGGTCGAAAAACA----------GG
|
|
|
|
chromosom 361 TTCGCTTCCTACCTAAGGCACTTACGGCCAATTAATGCGCAAGCGAGCGGAAGGTC-TAA
|
|
360 |----|------------------------------------||---|-----|||-||.
|
|
sequence 120 T----T------------------------------------GC---C-----GTCATAT
|
|
|
|
chromosom 420 CAG-GGCACCGAATTC 435
|
|
420 |.|-||----|--||| 436
|
|
sequence 132 CGGTGG----G--TTC 142
|
|
""",
|
|
)
|
|
line = format(alignment, "psl")
|
|
self.assertEqual(
|
|
line,
|
|
"""\
|
|
96 10 0 0 11 36 27 294 + sequence 142 0 142 chromosome 440 35 435 37 2,6,1,5,4,3,4,1,6,2,2,2,1,1,2,6,3,2,1,4,3,3,3,2,1,2,2,10,3,1,2,1,3,6,2,1,3, 0,2,8,12,17,21,24,28,29,35,41,43,45,46,47,49,55,58,63,67,71,81,84,87,90,95,99,108,118,121,122,124,125,129,136,138,139, 35,43,52,53,63,78,83,88,95,129,131,135,139,141,144,148,155,248,250,251,257,302,306,315,317,318,320,339,359,366,403,408,414,417,423,429,432,
|
|
""",
|
|
)
|
|
|
|
def test2(self):
|
|
aligner = self.aligner
|
|
chromosome = Seq(
|
|
"CTAATGCGCCTTGGTTTTGGCTTAACTAGAAGCAACCTGTAAGATTGCCAATTCTTCAGTCGAAGTAAATCTTCAATGTTTTGGACTCTTAGCGGATATGCGGCTGAGAAGTACGACATGTGTACATTCATACCTGCGTGACGGTCAGCCTCCCCCGGGACCTCATTGGGCGAATCTAGGTGTGATAATTGACACACTCTTGGTAAGAAGCACTCTTTACCCGATCTCCAAGTACCGACGCCAAGGCCAAGCTCTGCGATCTAAAGCTGCCGATCGTAGATCCAAGTCCTCAGCAAGCTCGCACGAATACGCAGTTCGAAGGCTGGGTGTTGTACGACGGTACGGTTGCTATAGCACTTTCGCGGTCTCGCTATTTTCAGTTTGACTCACCAGTCAGTATTGTCATCGACCAACTTGGAATAGTGTAACGCAGCGCTTGA"
|
|
)
|
|
chromosome.id = "chromosome"
|
|
transcript = Seq(
|
|
"CACCGGCGTCGGTACCAGAGGGCGTGAGTACCTTGTACTAGTACTCATTGGAATAATGCTCTTAGAAGTCATCTAAAAGTGACAACGCCTGTTTGGTTATGACGTTCACGACGCGTCTTAACAGACTAGCATTAGACCGACGGGTTGAGGCGTCTGGGTTGATACAGCCGTTTGCATCAGTGTATCTAACACTCTGAGGGATAATTGATGAACCGTGTTTTCCGATAGGTATGTACAGTACCACCACGCACGACTAAGGACCATTTTCTGCGTGCGACGGTTAAAATAACCTCAATCACT"
|
|
)
|
|
transcript.id = "transcript"
|
|
sequence = Seq(
|
|
"TCCCCTTCTAATGGAATCCCCCTCCGAAGGTCGCAGAAGCGGCCACGCCGGAGATACCAGTTCCACGCCTCAGGTTGGACTTGTCACACTTGTACGCGAT"
|
|
)
|
|
sequence.id = "sequence"
|
|
alignments1 = aligner.align(chromosome, transcript)
|
|
alignment1 = alignments1[0]
|
|
self.assertEqual(alignment1.coordinates.shape[1], 126)
|
|
self.assertEqual(
|
|
str(alignment1),
|
|
"""\
|
|
chromosom 5 GCGCCTTGGTTTTGGCTTAACTAGA-------AGCAACC-TGTAAGATTGCCAATTCTTC
|
|
0 |||--|.|||---------||.|||-------||-.|||-||||------------||--
|
|
transcrip 5 GCG--TCGGT---------ACCAGAGGGCGTGAG-TACCTTGTA------------CT--
|
|
|
|
chromosom 57 AGTCGAAGTAAATCTTCAATGTTTTGGA------CTCTTAG----CGGATATGCGGCTGA
|
|
60 ------||||---|-|||-----|||||------|||||||----|----||----||-|
|
|
transcrip 39 ------AGTA---C-TCA-----TTGGAATAATGCTCTTAGAAGTC----AT----CT-A
|
|
|
|
chromosom 107 GAAGTACGACA-----TGT---GT----ACATTCATAC--CTGCGT-------GACGGTC
|
|
120 .||||--||||-----|||---||----||.|||--||--|-||||-------|||--|-
|
|
transcrip 75 AAAGT--GACAACGCCTGTTTGGTTATGACGTTC--ACGAC-GCGTCTTAACAGAC--T-
|
|
|
|
chromosom 146 AGCCT----CCCCCGGGACCTCATTG-GGCGAATCTAGGTGTGATA-A-----TTGACA-
|
|
180 |||.|----||..||||------|||-||||--|||.|||-|||||-|-----|||-||-
|
|
transcrip 127 AGCATTAGACCGACGGG------TTGAGGCG--TCTGGGT-TGATACAGCCGTTTG-CAT
|
|
|
|
chromosom 194 CA----CTCTTGGTAAGAAGCACTCT---------TTACCCGATCTCCAAGTACCGACGC
|
|
240 ||----.|||-------||-||||||---------||----|||------|.||||----
|
|
transcrip 177 CAGTGTATCT-------AA-CACTCTGAGGGATAATT----GAT------GAACCG----
|
|
|
|
chromosom 241 CAAGGCCAAGCTCTG-----CGATCTAAAGCTGCCGATCGTAGATCCAAGTCCTCAGCAA
|
|
300 -------------||-----||||----||.|----||-|||----|-|||.|-||.||-
|
|
transcrip 215 -------------TGTTTTCCGAT----AGGT----AT-GTA----C-AGTAC-CACCA-
|
|
|
|
chromosom 296 GCTCGCACGAATACGCAG-------TTCGAAGGCTGGGTGTTGTACGACGGTACGGTTGC
|
|
360 ---|||||||.||---||-------||------|||.|||-----|||||||--------
|
|
transcrip 246 ---CGCACGACTA---AGGACCATTTT------CTGCGTG-----CGACGGT--------
|
|
|
|
chromosom 349 TATAGCACTTTCGCGGTCTCGCTATTTTCAGTTTGACTCACCAGTCAGTATTGTCATCGA
|
|
420 ||-|--|----------------||----------|---|||--|||--------|||--
|
|
transcrip 281 TA-A--A----------------AT----------A---ACC--TCA--------ATC--
|
|
|
|
chromosom 409 CCAACT 415
|
|
480 ---||| 486
|
|
transcrip 297 ---ACT 300
|
|
""",
|
|
)
|
|
alignments2 = aligner.align(transcript, sequence)
|
|
alignment2 = alignments2[0]
|
|
self.assertEqual(alignment2.coordinates.shape[1], 66)
|
|
self.assertEqual(
|
|
str(alignment2),
|
|
"""\
|
|
transcrip 8 TCGGTACCAGAGGGCGTGAGTACCTTGTACTAGTACTCATTGGAATAATGCTCTTAGAAG
|
|
0 ||------------|-------||||---|||------|-||||||--------------
|
|
sequence 0 TC------------C-------CCTT---CTA------A-TGGAAT--------------
|
|
|
|
transcrip 68 TCATCTAAAAGTGACAACGCCTGTTTGGTTATGACGTTCACGACGCGTCTTAACAGACTA
|
|
60 -----------------|.||-------------|--||-|||.|-|||---.||||--|
|
|
sequence 17 -----------------CCCC-------------C--TC-CGAAG-GTC---GCAGA--A
|
|
|
|
transcrip 128 GCATTAGACCGACG--GGTTGAGGCGTCTGGGTTGATACAGCCGTTTGCATCAGTGTATC
|
|
120 ||----|.||-|||--|---||------------|||||------------||||---||
|
|
sequence 38 GC----GGCC-ACGCCG---GA------------GATAC------------CAGT---TC
|
|
|
|
transcrip 186 TAACA---CTCTGAGGGATAATTGATGAACCGTGTTTTCCGATAGGTATGTACAGTACCA
|
|
180 ---||---|||--|||-----|||--||--|-----||----------------||----
|
|
sequence 63 ---CACGCCTC--AGG-----TTG--GA--C-----TT----------------GT----
|
|
|
|
transcrip 243 CCACGCACGACTAAGGACCATTTTCTG--CGTGCGA 277
|
|
240 -----|||-|||-------------||--|--|||| 276
|
|
sequence 84 -----CAC-ACT-------------TGTAC--GCGA 99
|
|
""",
|
|
)
|
|
alignment = alignment1.map(alignment2)
|
|
self.assertEqual(alignment.coordinates.shape[1], 78)
|
|
self.assertEqual(
|
|
str(alignment),
|
|
"""\
|
|
chromosom 10 TTGGTTTTGGCTTAACTAGAAGCAA-CC-TGTAAGATTGCCAATTCTTCAGTCGAAGTAA
|
|
0 |.------------------------||-|---------------||--------|----
|
|
sequence 0 TC-----------------------CCCTT---------------CT--------A----
|
|
|
|
chromosom 68 ATCTTCAATGTTTTGGACTCTTAGCGGATATGCGGCTGAGAAGTACGACATGTGTA----
|
|
60 ------|------||||-------------------------------------------
|
|
sequence 10 ------A------TGGA---------------------------------------ATCC
|
|
|
|
chromosom 124 --CATTCATAC--CTGCGT----GACGGTCAGCCT--CCCCCG--GGACCTCATTGGGCG
|
|
120 --|--||---|--.-|-||----||-----|||----||-.||--|---------|----
|
|
sequence 19 CCC--TC---CGAA-G-GTCGCAGA-----AGC--GGCC-ACGCCG---------G----
|
|
|
|
chromosom 172 AATCTAGGTGT-GATAATTGACA-CAC--TCTTGGTAAGAAGCA---CTCT---TTACCC
|
|
180 ------------||||--------||---||-----------||---|||----||----
|
|
sequence 51 -----------AGATA-------CCA-GTTC-----------CACGCCTC-AGGTT----
|
|
|
|
chromosom 222 GATCTCCAAGTACCGACGCCAAGGCCAAGCTCTGCGATCTAAAGCTGCCGATCGTAGATC
|
|
240 |--------|.--|----------------------------------------------
|
|
sequence 76 G--------GA--C----------------------------------------------
|
|
|
|
chromosom 282 CAA--GTCCTCAGCAAGCTCGCACGAATACGCAGTTCGAAGGCTG--GGTGTTGTACGA
|
|
300 -----||--------------|||-|.|---------------||--.--|-----|||
|
|
sequence 80 ---TTGT--------------CAC-ACT---------------TGTAC--G-----CGA
|
|
|
|
chromosom 337
|
|
359
|
|
sequence 99
|
|
""",
|
|
)
|
|
line = format(alignment, "psl")
|
|
self.assertEqual(
|
|
line,
|
|
"""\
|
|
61 6 0 0 14 32 28 260 + sequence 100 0 99 chromosome 440 10 337 35 2,2,1,2,1,1,4,1,2,1,1,1,2,2,3,2,3,1,1,4,2,2,2,3,2,1,2,1,2,3,3,2,1,1,3, 0,3,6,7,9,10,11,21,22,24,27,28,29,35,37,42,44,49,50,52,57,61,63,68,74,76,77,79,82,84,87,90,94,95,96, 10,35,37,53,63,74,81,124,127,132,133,135,137,139,146,151,154,157,167,183,194,197,210,212,216,222,231,235,285,301,305,323,325,328,334,
|
|
""",
|
|
)
|
|
|
|
|
|
def map_check(alignment1, alignment2):
|
|
line1 = format(alignment1, "psl")
|
|
handle = open("transcript.psl", "w")
|
|
handle.write(line1)
|
|
handle.close()
|
|
line2 = format(alignment2, "psl")
|
|
handle = open("sequence.psl", "w")
|
|
handle.write(line2)
|
|
handle.close()
|
|
stdout = os.popen("pslMap sequence.psl transcript.psl stdout")
|
|
line = stdout.read()
|
|
os.remove("transcript.psl")
|
|
os.remove("sequence.psl")
|
|
return line
|
|
|
|
|
|
def test_random(aligner, nBlocks1=1, nBlocks2=1, strand1="+", strand2="+"):
|
|
chromosome = "".join(["ACGT"[random.randint(0, 3)] for i in range(1000)])
|
|
nBlocks = nBlocks1
|
|
transcript = ""
|
|
position = 0
|
|
for i in range(nBlocks):
|
|
position += random.randint(60, 80)
|
|
blockSize = random.randint(60, 80)
|
|
transcript += chromosome[position : position + blockSize]
|
|
position += blockSize
|
|
nBlocks = nBlocks2
|
|
sequence = ""
|
|
position = 0
|
|
for i in range(nBlocks):
|
|
position += random.randint(20, 40)
|
|
blockSize = random.randint(20, 40)
|
|
sequence += transcript[position : position + blockSize]
|
|
position += blockSize
|
|
chromosome = Seq(chromosome)
|
|
transcript = Seq(transcript)
|
|
sequence = Seq(sequence)
|
|
if strand1 == "-":
|
|
chromosome = chromosome.reverse_complement()
|
|
if strand2 == "-":
|
|
sequence = sequence.reverse_complement()
|
|
chromosome.id = "chromosome"
|
|
transcript.id = "transcript"
|
|
sequence.id = "sequence"
|
|
alignments1 = aligner.align(chromosome, transcript, strand=strand1)
|
|
alignment1 = alignments1[0]
|
|
alignments2 = aligner.align(transcript, sequence, strand=strand2)
|
|
alignment2 = alignments2[0]
|
|
alignment = alignment1.map(alignment2)
|
|
line_check = map_check(alignment1, alignment2)
|
|
line = format(alignment, "psl")
|
|
assert line == line_check
|
|
print("Randomized test %d, %d, %s, %s OK" % (nBlocks1, nBlocks2, strand1, strand2))
|
|
|
|
|
|
def test_random_sequences(aligner, strand1="+", strand2="+"):
|
|
chromosome = "".join(["ACGT"[random.randint(0, 3)] for i in range(1000)])
|
|
transcript = "".join(["ACGT"[random.randint(0, 3)] for i in range(300)])
|
|
sequence = "".join(["ACGT"[random.randint(0, 3)] for i in range(100)])
|
|
chromosome = Seq(chromosome)
|
|
transcript = Seq(transcript)
|
|
sequence = Seq(sequence)
|
|
chromosome.id = "chromosome"
|
|
transcript.id = "transcript"
|
|
sequence.id = "sequence"
|
|
alignments = aligner.align(chromosome, transcript, strand=strand1)
|
|
alignment1 = alignments[0]
|
|
alignments = aligner.align(transcript, sequence, strand=strand2)
|
|
alignment2 = alignments[0]
|
|
line_check = map_check(alignment1, alignment2)
|
|
alignment = alignment1.map(alignment2)
|
|
line_check = line_check.split()
|
|
line = format(alignment, "psl")
|
|
line = line.split()
|
|
assert line[8:] == line_check[8:]
|
|
line1 = format(alignment1, "psl")
|
|
words = line1.split()
|
|
nBlocks1 = int(words[17])
|
|
line2 = format(alignment2, "psl")
|
|
words = line2.split()
|
|
nBlocks2 = int(words[17])
|
|
print(
|
|
"Randomized sequence test %d, %d, %s, %s OK"
|
|
% (nBlocks1, nBlocks2, strand1, strand2)
|
|
)
|
|
|
|
|
|
def perform_randomized_tests(n=1000):
|
|
"""Perform randomized tests and compare to pslMap.
|
|
|
|
Run this function to perform 8 x n mappings for alignments of randomly
|
|
generated sequences, get the alignment in PSL format, and compare the
|
|
result to that of pslMap.
|
|
"""
|
|
aligner = PairwiseAligner()
|
|
aligner.internal_open_gap_score = -1
|
|
aligner.internal_extend_gap_score = -0.0
|
|
aligner.match_score = +1
|
|
aligner.mismatch_score = -1
|
|
aligner.mode = "local"
|
|
for i in range(n):
|
|
nBlocks1 = random.randint(1, 10)
|
|
nBlocks2 = random.randint(1, 10)
|
|
test_random(aligner, nBlocks1, nBlocks2, "+", "+")
|
|
test_random(aligner, nBlocks1, nBlocks2, "+", "-")
|
|
test_random(aligner, nBlocks1, nBlocks2, "-", "+")
|
|
test_random(aligner, nBlocks1, nBlocks2, "-", "-")
|
|
test_random_sequences(aligner, "+", "+")
|
|
test_random_sequences(aligner, "+", "-")
|
|
test_random_sequences(aligner, "-", "+")
|
|
test_random_sequences(aligner, "-", "-")
|
|
|
|
|
|
class TestZeroGaps(unittest.TestCase):
|
|
def test1(self):
|
|
coordinates = np.array([[0, 3, 6, 9], [0, 3, 6, 9]])
|
|
sequences = [
|
|
SeqRecord(Seq(None, 9), id="genome"),
|
|
SeqRecord(Seq(None, 9), id="mRNA"),
|
|
]
|
|
alignment1 = Alignment(sequences, coordinates)
|
|
coordinates = np.array([[0, 9], [0, 9]])
|
|
sequences = [
|
|
SeqRecord(Seq(None, 9), id="mRNA"),
|
|
SeqRecord(Seq(None, 9), id="tag"),
|
|
]
|
|
alignment2 = Alignment(sequences, coordinates)
|
|
alignment = alignment1.map(alignment2)
|
|
self.assertTrue(
|
|
np.array_equal(
|
|
alignment.coordinates, np.array([[0, 3, 6, 9], [0, 3, 6, 9]])
|
|
)
|
|
)
|
|
|
|
def test2(self):
|
|
coordinates = np.array([[0, 69], [0, 69]])
|
|
sequences = [
|
|
SeqRecord(Seq(None, 69), id="genome"),
|
|
SeqRecord(Seq(None, 69), id="mRNA"),
|
|
]
|
|
alignment1 = Alignment(sequences, coordinates)
|
|
coordinates = np.array([[0, 24, 24, 69], [0, 24, 24, 69]])
|
|
sequences = [
|
|
SeqRecord(Seq(None, 69), id="mRNA"),
|
|
SeqRecord(Seq(None, 68), id="tag"),
|
|
]
|
|
alignment2 = Alignment(sequences, coordinates)
|
|
alignment = alignment1.map(alignment2)
|
|
# fmt: off
|
|
self.assertTrue(
|
|
np.array_equal(alignment.coordinates,
|
|
np.array([[0, 24, 69], # noqa: E128
|
|
[0, 24, 69]]))
|
|
)
|
|
# fmt: on
|
|
|
|
def test3(self):
|
|
coordinates = np.array([[0, 69], [0, 69]])
|
|
sequences = [
|
|
SeqRecord(Seq(None, 69), id="genome"),
|
|
SeqRecord(Seq(None, 69), id="mRNA"),
|
|
]
|
|
alignment1 = Alignment(sequences, coordinates)
|
|
coordinates = np.array([[0, 24, 24, 69], [0, 24, 23, 68]])
|
|
sequences = [
|
|
SeqRecord(Seq(None, 69), id="mRNA"),
|
|
SeqRecord(Seq(None, 68), id="tag"),
|
|
]
|
|
alignment2 = Alignment(sequences, coordinates)
|
|
alignment = alignment1.map(alignment2)
|
|
# fmt: off
|
|
self.assertTrue(
|
|
np.array_equal(alignment.coordinates,
|
|
np.array([[0, 24, 24, 69], # noqa: E128
|
|
[0, 24, 23, 68]]))
|
|
)
|
|
# fmt: on
|
|
|
|
def test4(self):
|
|
coordinates = np.array([[0, 210], [0, 210]])
|
|
sequences = [
|
|
SeqRecord(Seq(None, 210), id="genome"),
|
|
SeqRecord(Seq(None, 210), id="mRNA"),
|
|
]
|
|
alignment1 = Alignment(sequences, coordinates)
|
|
coordinates = np.array([[0, 18, 18, 102, 102, 210], [0, 18, 17, 101, 100, 208]])
|
|
sequences = [
|
|
SeqRecord(Seq(None, 210), id="mRNA"),
|
|
SeqRecord(Seq(None, 208), id="tag"),
|
|
]
|
|
alignment2 = Alignment(sequences, coordinates)
|
|
alignment = alignment1.map(alignment2)
|
|
# fmt: off
|
|
self.assertTrue(
|
|
np.array_equal(alignment.coordinates,
|
|
np.array([[0, 18, 18, 102, 102, 210],
|
|
[0, 18, 17, 101, 100, 208]]))
|
|
)
|
|
# fmt: on
|
|
|
|
def test5(self):
|
|
coordinates = np.array([[0, 210], [0, 210]])
|
|
sequences = [
|
|
SeqRecord(Seq(None, 210), id="genome"),
|
|
SeqRecord(Seq(None, 210), id="mRNA"),
|
|
]
|
|
alignment1 = Alignment(sequences, coordinates)
|
|
coordinates = np.array([[0, 51, 51, 210], [0, 51, 49, 208]])
|
|
sequences = [
|
|
SeqRecord(Seq(None, 210), id="mRNA"),
|
|
SeqRecord(Seq(None, 208), id="tag"),
|
|
]
|
|
alignment2 = Alignment(sequences, coordinates)
|
|
alignment = alignment1.map(alignment2)
|
|
# fmt: off
|
|
self.assertTrue(
|
|
np.array_equal(alignment.coordinates,
|
|
np.array([[0, 51, 51, 210],
|
|
[0, 51, 49, 208]]))
|
|
)
|
|
# fmt: on
|
|
|
|
|
|
class TestLiftOver(unittest.TestCase):
|
|
def test_chimp(self):
|
|
chain = Align.read("Blat/panTro5ToPanTro6.over.chain", "chain")
|
|
alignment = Align.read("Blat/est.panTro5.psl", "psl")
|
|
self.assertEqual(chain.target.id, alignment.target.id)
|
|
self.assertEqual(len(chain.target.seq), len(alignment.target.seq))
|
|
chain = chain[::-1]
|
|
record = SeqIO.read("Blat/est.fa", "fasta")
|
|
self.assertEqual(record.id, alignment.query.id)
|
|
self.assertEqual(len(record.seq), len(alignment.query.seq))
|
|
alignment.query = record.seq
|
|
record = SeqIO.read("Blat/panTro5.fa", "fasta")
|
|
chromosome, start_end = record.id.split(":")
|
|
start, end = start_end.split("-")
|
|
start = int(start)
|
|
end = int(end)
|
|
data = {start: str(record.seq)}
|
|
length = len(alignment.target.seq)
|
|
seq = Seq(data, length=length)
|
|
record = SeqRecord(seq, id="chr1")
|
|
alignment.target = record
|
|
text = """^\
|
|
chr1 122835789 AGAGATTATTTTGCAGAGGATGATGGGGAGATGGTACCCAGAACGAGTCACACAGCAGGT
|
|
0 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::--
|
|
query 32 AGAGATTATTTTGCAGAGGATGATGGGGAGATGGTACCCAGAACGAGTCACACAGCAG--
|
|
|
|
chr1 122835849 AAGGATGCTGTGGGCCTTGCCTTGTTAAATTCTTTGtttcttttgtttattcatttggtt
|
|
60 ------------------------------------------------------------
|
|
query 90 ------------------------------------------------------------
|
|
((.|\n)*)
|
|
chr1 122840889 TTGTATTTTGCAGAAACTGAATTCTGCTGGAATGTGCCAGTTAGAATGATCCTAGTGCTG
|
|
5100 ------------------------------------------------------------
|
|
query 90 ------------------------------------------------------------
|
|
|
|
chr1 122840949 TTATTATATAAACCTTTTTTGTTGTTGTTCTGTTTCATTGACAGCTTTTCTTAGTGACAC
|
|
5160 --------------------------------------------::::::::::::::::
|
|
query 90 --------------------------------------------CTTTTCTTAGTGACAC
|
|
|
|
chr1 122841009 TAAAGATCGAGGCCCTCCAGTGCAGTCACAGATCTGGAGAAGTGGTGAAAAGGTCCCGTT
|
|
5220 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::X:
|
|
query 106 TAAAGATCGAGGCCCTCCAGTGCAGTCACAGATCTGGAGAAGTGGTGAAAAGGTCCCGNT
|
|
|
|
chr1 122841069 TGTGCAGACATATTCCTTGAGAGCATTTGAGAAACCCCCTCAGGTACAGACCCAGGCTCT
|
|
5280 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
|
|
query 166 TGTGCAGACATATTCCTTGAGAGCATTTGAGAAACCCCCTCAGGTACAGACCCAGGCTCT
|
|
|
|
chr1 122841129 TCGAGACTTTGAGAAGGTAAGTCATGTGAGTGGATAATTGTTATCCCAATTAGAAGCAGT
|
|
5340 ::::::::::::::::--------------------------------------------
|
|
query 226 TCGAGACTTTGAGAAG--------------------------------------------
|
|
|
|
chr1 122841189 ACTATGGAATAGTGATGCCTGATAAAAATATGACCCATGGATTGGTCCGGATTATGGATG
|
|
5400 ------------------------------------------------------------
|
|
query 242 ------------------------------------------------------------
|
|
((.|\n)*)
|
|
chr1 122907129 GTTCTTGGGTTGAGGGGGCAATCGGGCACGCTCCTCCCCATGGGTTGCCCATCATGTCTA
|
|
71340 ------------------------------------------------------------
|
|
query 242 ------------------------------------------------------------
|
|
|
|
chr1 122907189 ATGGATATCGCACTCTGTCCCAGCACCTCAATGACCTGAAGAAGGAGAACTTCAGCCTCA
|
|
71400 -----------------------:::::::::::::::::::::::::::::::::::::
|
|
query 242 -----------------------CACCTCAATGACCTGAAGAAGGAGAACTTCAGCCTCA
|
|
|
|
chr1 122907249 AGCTGCGCATCTACTTCCTGGAGGAGCGCATGCAACAGAAGTATGAGGCCAGCCGGGAGG
|
|
71460 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
|
|
query 279 AGCTGCGCATCTACTTCCTGGAGGAGCGCATGCAACAGAAGTATGAGGCCAGCCGGGAGG
|
|
|
|
chr1 122907309 ACATC 122907314
|
|
71520 ::::: 71525
|
|
query 339 ACATC 344
|
|
$"""
|
|
self.assertRegex(str(alignment).replace("|", ":").replace(".", "X"), text)
|
|
lifted_alignment = chain.map(alignment)
|
|
# fmt: off
|
|
self.assertTrue(
|
|
np.array_equal(lifted_alignment.coordinates,
|
|
np.array([[111982717, 111982775, 111987921,
|
|
111988073, 112009200, 112009302],
|
|
[ 32, 90, 90,
|
|
242, 242, 344]])
|
|
)
|
|
)
|
|
# fmt: on
|
|
record = SeqIO.read("Blat/panTro6.fa", "fasta")
|
|
chromosome, start_end = record.id.split(":")
|
|
start, end = start_end.split("-")
|
|
start = int(start)
|
|
end = int(end)
|
|
data = {start: str(record.seq)}
|
|
length = len(alignment.target.seq)
|
|
seq = Seq(data, length=length)
|
|
record = SeqRecord(seq, id="chr1")
|
|
lifted_alignment.target = record
|
|
text = """^\
|
|
chr1 111982717 AGAGATTATTTTGCAGAGGATGATGGGGAGATGGTACCCAGAACGAGTCACACAGCAGGT
|
|
0 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::--
|
|
query 32 AGAGATTATTTTGCAGAGGATGATGGGGAGATGGTACCCAGAACGAGTCACACAGCAG--
|
|
|
|
chr1 111982777 AAGGATGCTGTGGGCCTTGCCTTGTTAAATTCTTTGtttcttttgtttattcatttggtt
|
|
60 ------------------------------------------------------------
|
|
query 90 ------------------------------------------------------------
|
|
((.|\n)*)
|
|
chr1 111987817 TTGTATTTTGCAGAAACTGAATTCTGCTGGAATGTGCCAGTTAGAATGATCCTAGTGCTG
|
|
5100 ------------------------------------------------------------
|
|
query 90 ------------------------------------------------------------
|
|
|
|
chr1 111987877 TTATTATATAAACCTTTTTTGTTGTTGTTCTGTTTCATTGACAGCTTTTCTTAGTGACAC
|
|
5160 --------------------------------------------::::::::::::::::
|
|
query 90 --------------------------------------------CTTTTCTTAGTGACAC
|
|
|
|
chr1 111987937 TAAAGATCGAGGCCCTCCAGTGCAGTCACAGATCTGGAGAAGTGGTGAAAAGGTCCCGTT
|
|
5220 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::X:
|
|
query 106 TAAAGATCGAGGCCCTCCAGTGCAGTCACAGATCTGGAGAAGTGGTGAAAAGGTCCCGNT
|
|
|
|
chr1 111987997 TGTGCAGACATATTCCTTGAGAGCATTTGAGAAACCCCCTCAGGTACAGACCCAGGCTCT
|
|
5280 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
|
|
query 166 TGTGCAGACATATTCCTTGAGAGCATTTGAGAAACCCCCTCAGGTACAGACCCAGGCTCT
|
|
|
|
chr1 111988057 TCGAGACTTTGAGAAGGTAAGTCATGTGAGTGGATAATTGTTATCCCAATTAGAAGCAGT
|
|
5340 ::::::::::::::::--------------------------------------------
|
|
query 226 TCGAGACTTTGAGAAG--------------------------------------------
|
|
|
|
chr1 111988117 ACTATGGAATAGTGATGCCTGATAAAAATATGACCCATGGATTGGTCCGGATTATGGATG
|
|
5400 ------------------------------------------------------------
|
|
query 242 ------------------------------------------------------------
|
|
((.|\n)*)
|
|
chr1 112009117 GTTCTTGGGTTGAGGGGGCAATCGGGCACGCTCCTCCCCATGGGTTGCCCATCATGTCTA
|
|
26400 ------------------------------------------------------------
|
|
query 242 ------------------------------------------------------------
|
|
|
|
chr1 112009177 ATGGATATCGCACTCTGTCCCAGCACCTCAATGACCTGAAGAAGGAGAACTTCAGCCTCA
|
|
26460 -----------------------:::::::::::::::::::::::::::::::::::::
|
|
query 242 -----------------------CACCTCAATGACCTGAAGAAGGAGAACTTCAGCCTCA
|
|
|
|
chr1 112009237 AGCTGCGCATCTACTTCCTGGAGGAGCGCATGCAACAGAAGTATGAGGCCAGCCGGGAGG
|
|
26520 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
|
|
query 279 AGCTGCGCATCTACTTCCTGGAGGAGCGCATGCAACAGAAGTATGAGGCCAGCCGGGAGG
|
|
|
|
chr1 112009297 ACATC 112009302
|
|
26580 ::::: 26585
|
|
query 339 ACATC 344
|
|
$"""
|
|
self.assertRegex(
|
|
str(lifted_alignment).replace("|", ":").replace(".", "X"), text
|
|
)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
runner = unittest.TextTestRunner(verbosity=2)
|
|
unittest.main(testRunner=runner)
|