Files
biopython/Tests/test_SearchIO_infernal_text_index.py
2025-03-22 06:59:36 +01:00

530 lines
37 KiB
Python

# Copyright 2024 by Samuel Prince. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Tests for SearchIO InfernalIO infernal-text indexing"""
import os
import unittest
from search_tests_common import CheckIndex
from search_tests_common import CheckRaw
class InfernalTabRawCases(CheckRaw):
    """Raw-record retrieval tests for cmsearch output in infernal-text format.

    Every test passes ``check_raw`` a fixture path below ``Infernal``, a query
    ID and the text expected for that query.

    NOTE(review): the class name says "Tab" while ``fmt`` is "infernal-text";
    the name is left untouched here.
    NOTE(review): the expected literals are presumably compared verbatim by
    ``CheckRaw.check_raw`` (defined in search_tests_common) - confirm before
    reformatting any of them.
    """

    fmt = "infernal-text"

    def test_infernal_text_1q(self):
        """Retrieve the raw infernal-text record, cmsearch, single query (U2_Yeast)."""
        expected = """# cmsearch :: search CM(s) against a sequence database
# INFERNAL 1.1.4 (Dec 2020)
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# query CM file: RF00004.cm
# target sequence database: GCA_000146045.2.fasta
# tabular output of hits: U2_Yeast-threshold.tbl
# sequence reporting threshold: score >= 46
# number of worker threads: 56
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Query: U2 [CLEN=193]
Accession: RF00004
Description: U2 spliceosomal RNA
Hit scores:
rank E-value score bias sequence start end mdl trunc gc description
---- --------- ------ ----- ----------------------- ------ ------ --- ----- ---- -----------
(1) ! 5.9e-20 98.7 0.1 ENA|BK006936|BK006936.2 681858 681747 - cm no 0.33 TPA_inf: Saccharomyces cerevisiae S288C chromosome II,
Hit alignments:
>> ENA|BK006936|BK006936.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome II, complete sequence.
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(1) ! 5.9e-20 98.7 0.1 cm 1 193 [] 681858 681747 - .. 0.91 no 0.33
v NC
::::::<<<-<<<<____>>>>->>>,,,,,,,,,,,,,,,,,,,,<<<<<<________>>>>>>,<<<<<<<___>>> CS
U2 1 AUacCUUCucgGCcUUUUgGCuaaGAUCAAGUGUAGUAUCUGUUCUUauCAGUuUAAuAuCUGauAuggcccccAuuggg 80
AU+ UCU+:GCCUUUUGGC:+AGAUCAAGUGUAGUAUCUGUUCUU:UCAGU+UAA+A+CUGA:AUG: CC:CA+UG:G
ENA|BK006936|BK006936.2 681858 AUC---UCUUUGCCUUUUGGCUUAGAUCAAGUGUAGUAUCUGUUCUUUUCAGUGUAACAACUGAAAUGA-CCUCAAUGAG 681783
***...************************************************************999.********** PP
v NC
>>>>,,,.,,,,,,,,,,,,,,,,,,,,,,,,,~~~~~~~~~~~~::::::: CS
U2 81 ggccaau.uauaUUAaauuaAUUUUUggaacua*[34]**[40]*Acccuuu 193
G+:CA+U U+UUAA+UU AC +UUU
ENA|BK006936|BK006936.2 681782 GCUCAUUaCCUUUUAAUUUG-------------*[ 6]**[ 3]*ACAUUUU 681747
******86555555555443................7.....9..******* PP
Internal CM pipeline statistics summary:
----------------------------------------
Query model(s): 1 (193 consensus positions)
Target sequences: 16 (24142652 residues searched)
Target sequences re-searched for truncated hits: 16 (15424 residues re-searched)
Windows passing local HMM SSV filter: 72732 (0.7978); expected (0.35)
Windows passing local HMM Viterbi filter: 24175 (0.3233); expected (0.15)
Windows passing local HMM Viterbi bias filter: 6671 (0.09669); expected (0.15)
Windows passing local HMM Forward filter: 2037 (0.03161); expected (0.003)
Windows passing local HMM Forward bias filter: 1133 (0.01757); expected (0.003)
Windows passing glocal HMM Forward filter: 596 (0.01251); expected (0.003)
Windows passing glocal HMM Forward bias filter: 438 (0.009175); expected (0.003)
Envelopes passing glocal HMM envelope defn filter: 460 (0.00429); expected (0.003)
Envelopes passing local CM CYK filter: 38 (0.000201); expected (0.0001)
Total CM hits reported: 1 (4.636e-06); includes 0 truncated hit(s)
# CPU time: 64.67u 2.16s 00:01:06.83 Elapsed: 00:00:03.09
//
"""
        self.check_raw(
            os.path.join("Infernal", "cmsearch_114_U2_Yeast.txt"), "U2", expected
        )

    def test_infernal_text_mq_first(self):
        """Retrieve the raw infernal-text record, cmsearch, multiple queries, first (IRES_5S_U2_Yeast)."""
        expected = """# cmsearch :: search CM(s) against a sequence database
# INFERNAL 1.1.4 (Dec 2020)
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# query CM file: IRES_5S_U2.cm
# target sequence database: GCA_000146045.2.fasta
# tabular output of hits: IRES_5S_U2_Yeast.tbl
# number of worker threads: 56
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Query: IRES_HCV [CLEN=352]
Accession: RF00061
Description: Hepatitis C virus internal ribosome entry site
Hit scores:
rank E-value score bias sequence start end mdl trunc gc description
---- --------- ------ ----- -------- ------ ------ --- ----- ---- -----------
[No hits detected that satisfy reporting thresholds]
Hit alignments:
[No hits detected that satisfy reporting thresholds]
Internal CM pipeline statistics summary:
----------------------------------------
Query model(s): 1 (352 consensus positions)
Target sequences: 16 (24142652 residues searched)
Target sequences re-searched for truncated hits: 16 (28160 residues re-searched)
Windows passing local HMM SSV filter: 6432 (0.1374); expected (0.35)
Windows passing local HMM Viterbi filter: 1631 (0.03555); expected (0.15)
Windows passing local HMM Viterbi bias filter: 1607 (0.03502); expected (0.15)
Windows passing local HMM Forward filter: 9 (0.0001992); expected (0.003)
Windows passing local HMM Forward bias filter: 9 (0.0001992); expected (0.003)
Windows passing glocal HMM Forward filter: 1 (2.16e-05); expected (0.003)
Windows passing glocal HMM Forward bias filter: 1 (2.16e-05); expected (0.003)
Envelopes passing glocal HMM envelope defn filter: 1 (1.705e-05); expected (0.003)
Envelopes passing local CM CYK filter: 0 (0); expected (0.0001)
Total CM hits reported: 0 (0); includes 0 truncated hit(s)
# CPU time: 3.98u 0.19s 00:00:04.17 Elapsed: 00:00:00.87
//
"""
        self.check_raw(
            os.path.join("Infernal", "cmsearch_114_IRES_5S_U2_Yeast.txt"),
            "IRES_HCV",
            expected,
        )

    def test_infernal_text_mq_middle(self):
        """Retrieve the raw infernal-text record, cmsearch, multiple queries, middle (IRES_5S_U2_Yeast)."""
        expected = """# cmsearch :: search CM(s) against a sequence database
# INFERNAL 1.1.4 (Dec 2020)
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# query CM file: IRES_5S_U2.cm
# target sequence database: GCA_000146045.2.fasta
# tabular output of hits: IRES_5S_U2_Yeast.tbl
# number of worker threads: 56
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Query: 5S_rRNA [CLEN=119]
Accession: RF00001
Description: 5S ribosomal RNA
Hit scores:
rank E-value score bias sequence start end mdl trunc gc description
---- --------- ------ ----- ----------------------- ------ ------ --- ----- ---- -----------
(1) ! 1.6e-18 88.8 0.0 ENA|BK006945|BK006945.2 459676 459796 + cm no 0.52 TPA_inf: Saccharomyces cerevisiae S288C chromosome XII
(2) ! 1.6e-18 88.8 0.0 ENA|BK006945|BK006945.2 489349 489469 + cm no 0.52 TPA_inf: Saccharomyces cerevisiae S288C chromosome XII
(3) ! 4.4e-17 83.2 0.0 ENA|BK006945|BK006945.2 468813 468933 + cm no 0.53 TPA_inf: Saccharomyces cerevisiae S288C chromosome XII
(4) ! 4.4e-17 83.2 0.0 ENA|BK006945|BK006945.2 472465 472585 + cm no 0.53 TPA_inf: Saccharomyces cerevisiae S288C chromosome XII
(5) ! 4.4e-17 83.2 0.0 ENA|BK006945|BK006945.2 482045 482165 + cm no 0.53 TPA_inf: Saccharomyces cerevisiae S288C chromosome XII
(6) ! 4.4e-17 83.2 0.0 ENA|BK006945|BK006945.2 485697 485817 + cm no 0.53 TPA_inf: Saccharomyces cerevisiae S288C chromosome XII
------ inclusion threshold ------
(7) ? 0.56 20.9 0.0 ENA|BK006943|BK006943.2 357031 357144 + cm no 0.46 TPA_inf: Saccharomyces cerevisiae S288C chromosome X,
(8) ? 6.6 16.7 0.3 ENA|BK006947|BK006947.3 7085 6968 - cm no 0.41 TPA_inf: Saccharomyces cerevisiae S288C chromosome XIV
Hit alignments:
>> ENA|BK006945|BK006945.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome XII, complete sequence.
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(1) ! 1.6e-18 88.8 0.0 cm 1 119 [] 459676 459796 + .. 0.99 no 0.52
v NC
(((((((((,,,,<<-<<<<<---<<--<<<<<<______>>-->>>.>-->>---->>>>>-->><<<-<<---.-<-< CS
5S_rRNA 1 gccuGcggcCAUAccagcgcgaAagcACcgGauCCCAUCcGaACuCc.gAAguUAAGcgcgcUugggCcagggUA.GUAc 78
G::UGC:GCCAUA:C :C::GAAAGCACCG :UCCC+UCCGA C: C G AGUUAAGC::G: +G:GCC G: GUA
ENA|BK006945|BK006945.2 459676 GGUUGCGGCCAUAUCUACCAGAAAGCACCGUUUCCCGUCCGAUCAACuGUAGUUAAGCUGGUAAGAGCCUGACCGaGUAG 459755
***********************************************99***********************8756**** PP
v vv NC
<-----<<____>>----->>->-->>->>>))))))))): CS
5S_rRNA 79 uagGaUGgGuGAcCuCcUGggAAgaccagGugccgCaggcc 119
+ +UGGGUGACC+ G AA :CAGGUGC:GCA::C+
ENA|BK006945|BK006945.2 459756 UGUAGUGGGUGACCAUACGCGAAACUCAGGUGCUGCAAUCU 459796
***********************9***************** PP
>> ENA|BK006945|BK006945.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome XII, complete sequence.
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(2) ! 1.6e-18 88.8 0.0 cm 1 119 [] 489349 489469 + .. 0.99 no 0.52
v NC
(((((((((,,,,<<-<<<<<---<<--<<<<<<______>>-->>>.>-->>---->>>>>-->><<<-<<---.-<-< CS
5S_rRNA 1 gccuGcggcCAUAccagcgcgaAagcACcgGauCCCAUCcGaACuCc.gAAguUAAGcgcgcUugggCcagggUA.GUAc 78
G::UGC:GCCAUA:C :C::GAAAGCACCG :UCCC+UCCGA C: C G AGUUAAGC::G: +G:GCC G: GUA
ENA|BK006945|BK006945.2 489349 GGUUGCGGCCAUAUCUACCAGAAAGCACCGUUUCCCGUCCGAUCAACuGUAGUUAAGCUGGUAAGAGCCUGACCGaGUAG 489428
***********************************************99***********************8756**** PP
v vv NC
<-----<<____>>----->>->-->>->>>))))))))): CS
5S_rRNA 79 uagGaUGgGuGAcCuCcUGggAAgaccagGugccgCaggcc 119
+ +UGGGUGACC+ G AA :CAGGUGC:GCA::C+
ENA|BK006945|BK006945.2 489429 UGUAGUGGGUGACCAUACGCGAAACUCAGGUGCUGCAAUCU 489469
***********************9***************** PP
>> ENA|BK006945|BK006945.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome XII, complete sequence.
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(3) ! 4.4e-17 83.2 0.0 cm 1 119 [] 468813 468933 + .. 0.99 no 0.53
v NC
(((((((((,,,,<<-<<<<<---<<--<<<<<<______>>-->>>.>-->>---->>>>>-->><<<-<<---.-<-< CS
5S_rRNA 1 gccuGcggcCAUAccagcgcgaAagcACcgGauCCCAUCcGaACuCc.gAAguUAAGcgcgcUugggCcagggUA.GUAc 78
: UGC:GCCAUA:C :C::GAAAGCACCG :UCCC+UCCGA C: C G AGUUAAGC::G: +G:GCC G: GUA
ENA|BK006945|BK006945.2 468813 GGUUGCGGCCAUAUCUACCAGAAAGCACCGUUUCCCGUCCGAUCAACuGUAGUUAAGCUGGUAAGAGCCUGACCGaGUAG 468892
***********************************************99***********************8756**** PP
v vv NC
<-----<<____>>----->>->-->>->>>))))))))): CS
5S_rRNA 79 uagGaUGgGuGAcCuCcUGggAAgaccagGugccgCaggcc 119
+ +UGGGUGACC+ G AA :CAGGUGC:GCA :
ENA|BK006945|BK006945.2 468893 UGUAGUGGGUGACCAUACGCGAAACUCAGGUGCUGCAGUUG 468933
***********************9***************** PP
>> ENA|BK006945|BK006945.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome XII, complete sequence.
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(4) ! 4.4e-17 83.2 0.0 cm 1 119 [] 472465 472585 + .. 0.99 no 0.53
v NC
(((((((((,,,,<<-<<<<<---<<--<<<<<<______>>-->>>.>-->>---->>>>>-->><<<-<<---.-<-< CS
5S_rRNA 1 gccuGcggcCAUAccagcgcgaAagcACcgGauCCCAUCcGaACuCc.gAAguUAAGcgcgcUugggCcagggUA.GUAc 78
: UGC:GCCAUA:C :C::GAAAGCACCG :UCCC+UCCGA C: C G AGUUAAGC::G: +G:GCC G: GUA
ENA|BK006945|BK006945.2 472465 GGUUGCGGCCAUAUCUACCAGAAAGCACCGUUUCCCGUCCGAUCAACuGUAGUUAAGCUGGUAAGAGCCUGACCGaGUAG 472544
***********************************************99***********************8756**** PP
v vv NC
<-----<<____>>----->>->-->>->>>))))))))): CS
5S_rRNA 79 uagGaUGgGuGAcCuCcUGggAAgaccagGugccgCaggcc 119
+ +UGGGUGACC+ G AA :CAGGUGC:GCA :
ENA|BK006945|BK006945.2 472545 UGUAGUGGGUGACCAUACGCGAAACUCAGGUGCUGCAGUUG 472585
***********************9***************** PP
>> ENA|BK006945|BK006945.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome XII, complete sequence.
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(5) ! 4.4e-17 83.2 0.0 cm 1 119 [] 482045 482165 + .. 0.99 no 0.53
v NC
(((((((((,,,,<<-<<<<<---<<--<<<<<<______>>-->>>.>-->>---->>>>>-->><<<-<<---.-<-< CS
5S_rRNA 1 gccuGcggcCAUAccagcgcgaAagcACcgGauCCCAUCcGaACuCc.gAAguUAAGcgcgcUugggCcagggUA.GUAc 78
: UGC:GCCAUA:C :C::GAAAGCACCG :UCCC+UCCGA C: C G AGUUAAGC::G: +G:GCC G: GUA
ENA|BK006945|BK006945.2 482045 GGUUGCGGCCAUAUCUACCAGAAAGCACCGUUUCCCGUCCGAUCAACuGUAGUUAAGCUGGUAAGAGCCUGACCGaGUAG 482124
***********************************************99***********************8756**** PP
v vv NC
<-----<<____>>----->>->-->>->>>))))))))): CS
5S_rRNA 79 uagGaUGgGuGAcCuCcUGggAAgaccagGugccgCaggcc 119
+ +UGGGUGACC+ G AA :CAGGUGC:GCA :
ENA|BK006945|BK006945.2 482125 UGUAGUGGGUGACCAUACGCGAAACUCAGGUGCUGCAGUUG 482165
***********************9***************** PP
>> ENA|BK006945|BK006945.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome XII, complete sequence.
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(6) ! 4.4e-17 83.2 0.0 cm 1 119 [] 485697 485817 + .. 0.99 no 0.53
v NC
(((((((((,,,,<<-<<<<<---<<--<<<<<<______>>-->>>.>-->>---->>>>>-->><<<-<<---.-<-< CS
5S_rRNA 1 gccuGcggcCAUAccagcgcgaAagcACcgGauCCCAUCcGaACuCc.gAAguUAAGcgcgcUugggCcagggUA.GUAc 78
: UGC:GCCAUA:C :C::GAAAGCACCG :UCCC+UCCGA C: C G AGUUAAGC::G: +G:GCC G: GUA
ENA|BK006945|BK006945.2 485697 GGUUGCGGCCAUAUCUACCAGAAAGCACCGUUUCCCGUCCGAUCAACuGUAGUUAAGCUGGUAAGAGCCUGACCGaGUAG 485776
***********************************************99***********************8756**** PP
v vv NC
<-----<<____>>----->>->-->>->>>))))))))): CS
5S_rRNA 79 uagGaUGgGuGAcCuCcUGggAAgaccagGugccgCaggcc 119
+ +UGGGUGACC+ G AA :CAGGUGC:GCA :
ENA|BK006945|BK006945.2 485777 UGUAGUGGGUGACCAUACGCGAAACUCAGGUGCUGCAGUUG 485817
***********************9***************** PP
>> ENA|BK006943|BK006943.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome X, complete sequence.
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(7) ? 0.56 20.9 0.0 cm 1 119 [] 357031 357144 + .. 0.86 no 0.46
v vv v vv vv v vv vv NC
(((((((((,,,,<<-<<<<<---<<--<<<<<<______>>-->>>>-.->>---->>>>>-->><<<-<<----<-<< CS
5S_rRNA 1 gccuGcggcCAUAccagcgcgaAagcACcgGauCCCAUCcGaACuCcgA.AguUAAGcgcgcUugggCcagggUAGUAcu 79
::: GC:G CA A:: G :G+AA: AC:: ++ CAU C ::A A :UAAGC: CU+:: C G:G GUACU
ENA|BK006943|BK006943.2 357031 CAGGGCUGGCAGAGGUGUCGGGAAAAACAAGGAU-CAUAU--CCUUUUAcAAUUAAGCCAUCUACCACCUGAG--GUACU 357105
****************************888743.44433..555555579**********************..***** PP
v v vvv NC
-----<<____>>----->>->-->>->>>)))))).))): CS
5S_rRNA 80 agGaUGgGuGAcCuCcUGggAAgaccagGugccgCa.ggcc 119
A + G CU C GGGAA+A:C+G C:GC :::+
ENA|BK006943|BK006943.2 357106 AAAG-G-AAAGGCUACCGGGAAUAUCUGAAACAGCUgCUGU 357144
9994.3.33334778889****************9879*** PP
>> ENA|BK006947|BK006947.3 TPA_inf: Saccharomyces cerevisiae S288C chromosome XIV, complete sequence.
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(8) ? 6.6 16.7 0.3 cm 1 119 [] 7085 6968 - .. 0.91 no 0.41
v v v v v v v NC
(((((((((,,,,<<-<<.<<<.---<<--<<<<.<<______>>-->>>>-->>---->>>>>-->><<<-<<----<-<<-- CS
5S_rRNA 1 gccuGcggcCAUAccagc.gcg.aAagcACcgGa.uCCCAUCcGaACuCcgAAguUAAGcgcgcUugggCcagggUAGUAcuag 81
: :: ::: AUAC + :: G:AC:::: CC AUC+G ::::AA:U AAG :: U+ GGC: :G GUA U+G
ENA|BK006947|BK006947.3 7085 GAGAUGGUAUAUACUGUAgCAUcCGUGUACGUAUgACCGAUCAGA--AUACAAGUGAAGGUGAGUAUGGCAUGUG--GUAGUGG 7006
**************976325541459999****989999999999..89**********9999999*********..******* PP
v NC
---<<____>>----->>->-->>->>>.))))))))): CS
5S_rRNA 82 GaUGgGuGAcCuCcUGggAAgaccagGu.gccgCaggcc 119
GAU :G G : GG AAG+: A:GU ::: :: : C
ENA|BK006947|BK006947.3 7005 GAUUAGAG-UGGUAGGGUAAGUAUAUGUgUAUUAUUUAC 6968
***99988.689999************************ PP
Internal CM pipeline statistics summary:
----------------------------------------
Query model(s): 1 (119 consensus positions)
Target sequences: 16 (24142652 residues searched)
Target sequences re-searched for truncated hits: 16 (12416 residues re-searched)
Windows passing local HMM SSV filter: 24991 (0.2423); expected (0.35)
Windows passing local HMM Viterbi filter: 8464 (0.08504); expected (0.15)
Windows passing local HMM Viterbi bias filter: 8432 (0.08473); expected (0.15)
Windows passing local HMM Forward filter: 135 (0.001502); expected (0.003)
Windows passing local HMM Forward bias filter: 134 (0.001493); expected (0.003)
Windows passing glocal HMM Forward filter: 65 (0.0007404); expected (0.003)
Windows passing glocal HMM Forward bias filter: 65 (0.0007404); expected (0.003)
Envelopes passing glocal HMM envelope defn filter: 61 (0.0003446); expected (0.003)
Envelopes passing local CM CYK filter: 15 (6.52e-05); expected (0.0001)
Total CM hits reported: 8 (3.966e-05); includes 0 truncated hit(s)
# CPU time: 4.17u 0.21s 00:00:04.38 Elapsed: 00:00:00.40
//
"""
        self.check_raw(
            os.path.join("Infernal", "cmsearch_114_IRES_5S_U2_Yeast.txt"),
            "5S_rRNA",
            expected,
        )

    def test_infernal_text_mq_last(self):
        """Retrieve the raw infernal-text record, cmsearch, multiple queries, last (IRES_5S_U2_Yeast)."""
        expected = """# cmsearch :: search CM(s) against a sequence database
# INFERNAL 1.1.4 (Dec 2020)
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# query CM file: IRES_5S_U2.cm
# target sequence database: GCA_000146045.2.fasta
# tabular output of hits: IRES_5S_U2_Yeast.tbl
# number of worker threads: 56
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Query: U2 [CLEN=193]
Accession: RF00004
Description: U2 spliceosomal RNA
Hit scores:
rank E-value score bias sequence start end mdl trunc gc description
---- --------- ------ ----- ----------------------- ------ ------ --- ----- ---- -----------
(1) ! 5.9e-20 98.7 0.1 ENA|BK006936|BK006936.2 681858 681747 - cm no 0.33 TPA_inf: Saccharomyces cerevisiae S288C chromosome II,
------ inclusion threshold ------
(2) ? 0.49 19.8 0.0 ENA|BK006948|BK006948.2 737498 737324 - cm no 0.39 TPA_inf: Saccharomyces cerevisiae S288C chromosome XV,
(3) ? 5.7 15.3 0.0 ENA|BK006947|BK006947.3 266059 266208 + cm no 0.39 TPA_inf: Saccharomyces cerevisiae S288C chromosome XIV
(4) ? 6.6 15.1 0.4 ENA|BK006949|BK006949.2 443393 443253 - cm no 0.32 TPA_inf: Saccharomyces cerevisiae S288C chromosome XVI
(5) ? 7.1 14.9 0.0 ENA|BK006939|BK006939.2 190882 191043 + cm no 0.41 TPA_inf: Saccharomyces cerevisiae S288C chromosome V,
Hit alignments:
>> ENA|BK006936|BK006936.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome II, complete sequence.
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(1) ! 5.9e-20 98.7 0.1 cm 1 193 [] 681858 681747 - .. 0.91 no 0.33
v NC
::::::<<<-<<<<____>>>>->>>,,,,,,,,,,,,,,,,,,,,<<<<<<________>>>>>>,<<<<<<<___>>> CS
U2 1 AUacCUUCucgGCcUUUUgGCuaaGAUCAAGUGUAGUAUCUGUUCUUauCAGUuUAAuAuCUGauAuggcccccAuuggg 80
AU+ UCU+:GCCUUUUGGC:+AGAUCAAGUGUAGUAUCUGUUCUU:UCAGU+UAA+A+CUGA:AUG: CC:CA+UG:G
ENA|BK006936|BK006936.2 681858 AUC---UCUUUGCCUUUUGGCUUAGAUCAAGUGUAGUAUCUGUUCUUUUCAGUGUAACAACUGAAAUGA-CCUCAAUGAG 681783
***...************************************************************999.********** PP
v NC
>>>>,,,.,,,,,,,,,,,,,,,,,,,,,,,,,~~~~~~~~~~~~::::::: CS
U2 81 ggccaau.uauaUUAaauuaAUUUUUggaacua*[34]**[40]*Acccuuu 193
G+:CA+U U+UUAA+UU AC +UUU
ENA|BK006936|BK006936.2 681782 GCUCAUUaCCUUUUAAUUUG-------------*[ 6]**[ 3]*ACAUUUU 681747
******86555555555443................7.....9..******* PP
>> ENA|BK006948|BK006948.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome XV, complete sequence.
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(2) ? 0.49 19.8 0.0 cm 1 193 [] 737498 737324 - .. 0.96 no 0.39
NC
::::::<<<-<<<<____>>>>->>>,,,,,,,,,,,,,,,,,,,,<<<<<<~~~~~~>>>>>>,<<<<<<<___>>>>> CS
U2 1 AUacCUUCucgGCcUUUUgGCuaaGAUCAAGUGUAGUAUCUGUUCUUauCAG*[ 8]*CUGauAuggcccccAuuggggg 82
AU+CC U U+ GCC U GGC +A AU AAGU UA UA C GUUCU:A::A U::U: ::::::A U:::::
ENA|BK006948|BK006948.2 737498 AUCCCAUAUUUGCCAUC-GGCAUAUAUUAAGUAUAUUAGCAGUUCUAAUUAC*[88]*GUAGUUGGAAGGAUACUAUCCU 737338
**************999.*******************************996...*..6999999999999999999999 PP
NC
>>,,,,,,,,,,,,,,,,,,,,,,,,,,,,~~~~~~~~~~~~::::::: CS
U2 83 ccaauuauaUUAaauuaAUUUUUggaacua*[34]**[40]*Acccuuu 193
: A+ A CC++U
ENA|BK006948|BK006948.2 737337 UUAU--------------------------*[ 2]**[ 1]*AUCCCCU 737324
9987.............................6.....9..******* PP
>> ENA|BK006947|BK006947.3 TPA_inf: Saccharomyces cerevisiae S288C chromosome XIV, complete sequence.
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(3) ? 5.7 15.3 0.0 cm 1 193 [] 266059 266208 + .. 0.91 no 0.39
v v v NC
::::::.<<<.-<<<<____>>>>->>>,,,,.,,,,,,~~~~~~,,,,,,,,,,,,,,,,,,,,,,,,,,,,<<<<<<< CS
U2 1 AUacCU.UCu.cgGCcUUUUgGCuaaGAUCAA.GUGUAG*[48]*aauuauaUUAaauuaAUUUUUggaacuaGuggggg 119
AU UCU + G C UUG C AGAU A GUGUAG UUAUAU +UU AU UUU G +A:: : G:
ENA|BK006947|BK006947.3 266059 AUGUUGaUCUaUCGUCAAUUGACCCAGAUGAUaGUGUAG*[ 1]*-GUUAUAUAGUUUUGAUAUUUUGGCGAAAAGUUGA 266132
*****9****999****************9988999987...5...3377778888888888888888888888888888 PP
v v v v v v v v v NC
<----.<<<<<__>>>>>-..->>>>>>>>,,<<<<<<-<<<<<<___________>>>>>>-->>>>>>::::::: CS
U2 120 cauuu.uggGCUUGCccau..ugcccccaCacggguugaccuggcaUUGCAcUaccgccagguucagcccAcccuuu 193
:A+U+ U :GCUUGC: AU +::C : :: G: :AC: G U GCA UA+ C :GU+: :C +U +
ENA|BK006947|BK006947.3 266133 GAAUAuUGCGCUUGCGUAUauAUUCCAUUUGAGGUGGCACUAGAGCUCGCAUUAU-UACCAGUAGUGGCAGGAUUGC 266208
888888**************99999******************************.99******************* PP
>> ENA|BK006949|BK006949.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome XVI, complete sequence.
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(4) ? 6.6 15.1 0.4 cm 1 193 [] 443393 443253 - .. 0.71 no 0.32
v v v v NC
::::::<<<-<<<<____>>>>->>>,,,,,,,,,,,,,,,,,,,,~~~~~~,<<<<<<<___>>>>>>>,,,..,,,,, CS
U2 1 AUacCUUCucgGCcUUUUgGCuaaGAUCAAGUGUAGUAUCUGUUCU*[20]*uggcccccAuugggggccaau..uauaU 92
A U C:+G C G UA:G UCAAG UAGUAU UGUUCU ::: C A+ G :::++U
ENA|BK006949|BK006949.2 443393 CCAUUUACAUGAACCCCAGUUUAUGUUCAAG--UAGUAUAUGUUCU*[ 4]*-UCAACGCAAACUGCUGAUUUcaCC--- 443322
***************999************6..7**********98...4...222222222222222222222222... PP
v v v v v v v NC
,,,,,,,,,,,,,,,,,,,,<<<<<<<<----<<<<<__>>>>>-->>>>>>>>,,<<<<<<-<<<<<<___________ CS
U2 93 UAaauuaAUUUUUggaacuaGugggggcauuuuggGCUUGCccauugcccccaCacggguugaccuggcaUUGCAcUacc 172
:U : :::+U U UU+ ::: : A:A+ ::: G+C: : :A U CAC AC
ENA|BK006949|BK006949.2 443321 --------------------AUUGAAUAUUGU-----UUA------UAUAUGAUAUAUACCGUCAAAUUACUUCACGAC- 443274
....................222222222222.....221......3333345599***********************. PP
v v v NC
>>>>>>-->>>>>>::::::: CS
U2 173 gccagguucagcccAcccuuu 193
: : :G++C ::: + U+
ENA|BK006949|BK006949.2 443273 AGUGUGAACUGUGAUAAAUCA 443253
********************* PP
>> ENA|BK006939|BK006939.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome V, complete sequence.
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(5) ? 7.1 14.9 0.0 cm 1 193 [] 190882 191043 + .. 0.92 no 0.41
v v NC
::::::<<<.-<<<<____>>>>->>>,,,,,,,,,,,,,,,,,,,,~~~~~~~~~~~~,,,,,,,,,,,,,,,,,,,,, CS
U2 1 AUacCUUCu.cgGCcUUUUgGCuaaGAUCAAGUGUAGUAUCUGUUCU*[20]**[18]*aauuauaUUAaauuaAUUUUU 105
AU CC U : +: C: UUU:G : : AUCA AA ++U+UUAAAU AA UUUU
ENA|BK006939|BK006939.2 190882 AUUCCAUGAuUUCCUGUUUAGCU-UCAUCA-----------------*[ 4]**[ 4]*AACAUUUUUAAAUGAAAUUUU 190939
*******9955678899999976.799997....................9.....9..77699**************** PP
vv v v vv v NC
,,,,,,,<<<<<<<<----.<<<<<_._>>>>>-.........->>>>>>>>,,<<<<<<-<<<<<<__________... CS
U2 106 ggaacuaGugggggcauuu.uggGCU.UGCccau.........ugcccccaCacggguugaccuggcaUUGCAcUac... 171
+ + +GU:: : AUU :G:GCU UGC:C: + : ::ACA+ :: :: : :::: U CAC AC
ENA|BK006939|BK006939.2 190940 AAUGUCUGUUUCCUUAUUGaAGAGCUuUGCUCUGgauuuuccaACAUUAAACAUGCCGCCGAGGCCUCCUCCACCACcac 191019
***********9999998899999964799999999999999999999**************************999888 PP
v NC
.._>>>>>>-->>>>>>::::::: CS
U2 172 ..cgccagguucagcccAcccuuu 193
+:::: :UU:: :: + CU+U
ENA|BK006939|BK006939.2 191020 caUUGGCAUUUGGUGGUGAACUAU 191043
988********************* PP
Internal CM pipeline statistics summary:
----------------------------------------
Query model(s): 1 (193 consensus positions)
Target sequences: 16 (24142652 residues searched)
Target sequences re-searched for truncated hits: 16 (15424 residues re-searched)
Windows passing local HMM SSV filter: 72732 (0.7978); expected (0.35)
Windows passing local HMM Viterbi filter: 24175 (0.3233); expected (0.15)
Windows passing local HMM Viterbi bias filter: 6671 (0.09669); expected (0.15)
Windows passing local HMM Forward filter: 2037 (0.03161); expected (0.003)
Windows passing local HMM Forward bias filter: 1133 (0.01757); expected (0.003)
Windows passing glocal HMM Forward filter: 596 (0.01251); expected (0.003)
Windows passing glocal HMM Forward bias filter: 438 (0.009175); expected (0.003)
Envelopes passing glocal HMM envelope defn filter: 460 (0.00429); expected (0.003)
Envelopes passing local CM CYK filter: 38 (0.000201); expected (0.0001)
Total CM hits reported: 5 (3.063e-05); includes 0 truncated hit(s)
# CPU time: 67.17u 2.23s 00:01:09.40 Elapsed: 00:00:03.24
//
"""
        self.check_raw(
            os.path.join("Infernal", "cmsearch_114_IRES_5S_U2_Yeast.txt"),
            "U2",
            expected,
        )
class Hmmer3TextIndexCases(CheckIndex):
    """Indexing tests for cmsearch output in the infernal-text format.

    Each test passes ``check_index`` one fixture path below ``Infernal``
    together with the format name stored in ``fmt``.

    NOTE(review): the class name says "Hmmer3" although ``fmt`` is
    "infernal-text"; the name is left untouched here.
    """

    fmt = "infernal-text"

    def test_infernal_text_1q_0m(self):
        """Test infernal-text indexing, cmsearch, one query, no hits."""
        self.check_index(
            os.path.join("Infernal", "cmsearch_114_IRES_Yeast.txt"), self.fmt
        )

    def test_infernal_text_1q_mm(self):
        """Test infernal-text indexing, cmsearch, one query, multiple hits."""
        self.check_index(
            os.path.join("Infernal", "cmsearch_114_5S_Yeast.txt"), self.fmt
        )

    def test_infernal_text_mq(self):
        """Test infernal-text indexing, cmsearch, multiple queries."""
        self.check_index(
            os.path.join("Infernal", "cmsearch_114_IRES_5S_U2_Yeast.txt"), self.fmt
        )
if __name__ == "__main__":
    # Script entry point: run this module's tests with a verbosity-2 text runner.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))