mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 21:53:47 +08:00
Add SeqIO support for GFA files. (#4598)
* Adds tags from segment lines into the annotations dictionary of the SeqRecord, handles undefined sequences and sets their length according to LN tag, raises a warning on incorrect tag types, and adds more links to the format documentation. * Check for valid tag name and store tag types in annotations * Tag value may contain colons * Update NEWS.rst
This commit is contained in:
177
Bio/SeqIO/GfaIO.py
Normal file
177
Bio/SeqIO/GfaIO.py
Normal file
@ -0,0 +1,177 @@
|
|||||||
|
"""Bio.SeqIO support for the Graphical Fragment Assembly format.
|
||||||
|
|
||||||
|
This format is output by many assemblers and includes linkage information for
|
||||||
|
how the different sequences fit together; however, we only care about the
|
||||||
|
segment (sequence) information.
|
||||||
|
|
||||||
|
Documentation:
|
||||||
|
- Version 1.x: https://gfa-spec.github.io/GFA-spec/GFA1.html
|
||||||
|
- Version 2.0: https://gfa-spec.github.io/GFA-spec/GFA2.html
|
||||||
|
"""
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
import hashlib
|
||||||
|
import re
|
||||||
|
|
||||||
|
from Bio import BiopythonWarning
|
||||||
|
from Bio.File import as_handle
|
||||||
|
from Bio.Seq import Seq, _UndefinedSequenceData
|
||||||
|
from Bio.SeqRecord import SeqRecord
|
||||||
|
|
||||||
|
|
||||||
|
def _check_tags(seq, tags):
    """Check a segment line's tags for inconsistencies (PRIVATE).

    Arguments:
     - seq - the segment's sequence object. Its length is compared with any
       ``LN`` tag and its SHA-256 digest with any ``SH`` tag.
     - tags - list of optional fields, each a ``TAG:TYPE:VALUE`` string.

    If ``seq`` has length zero and an ``LN`` tag is present, ``seq._data`` is
    replaced in place with undefined sequence data of the stated length.
    A BiopythonWarning is issued when the stated length or checksum does not
    agree with the sequence. Raises ValueError (from ``int``) if the value of
    an ``LN`` tag is not an integer. Returns None.
    """
    for tag in tags:
        # Tags look like "LN:i:528": two letter name, one letter type, value.
        name, value = tag[:2], tag[5:]
        if name == "LN":
            # Sequence length
            expected_length = int(value)
            if len(seq) == 0:
                # No sequence data, set the sequence length
                seq._data = _UndefinedSequenceData(expected_length)
            elif expected_length != len(seq):
                warnings.warn(
                    f"Segment line has incorrect length. Expected {value} but got {len(seq)}.",
                    BiopythonWarning,
                )
        elif name == "SH":
            # SHA256 checksum
            if not getattr(seq, "defined", True):
                # The sequence was given as "*", so there are no letters to
                # hash; converting it to a string would raise an exception.
                continue
            checksum = hashlib.sha256(str(seq).encode()).hexdigest()
            # Compare case-insensitively so that a correct digest written in
            # lowercase is not reported as a checksum mismatch.
            if checksum.upper() != value.upper():
                warnings.warn(
                    f"Segment line has incorrect checksum. Expected {value} but got {checksum}.",
                    BiopythonWarning,
                )
|
||||||
|
|
||||||
|
|
||||||
|
# Pattern for a valid two character tag name.
_TAG_NAME = re.compile(r"[A-Za-z][A-Za-z0-9]")

# For each tag type, the pattern its value must match (these RegExs are part
# of the 1.0 standard) and the description used in the mismatch warning.
_TAG_VALUE_FORMATS = {
    "A": (re.compile(r"[!-~]"), "printable character"),
    "i": (re.compile(r"[-+]?[0-9]+"), "signed integer"),
    "f": (re.compile(r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?"), "float"),
    "Z": (re.compile(r"[ !-~]+"), "printable string"),
    "J": (
        re.compile(r"[ !-~]+"),
        "JSON excluding new-line and tab characters",
    ),
    "H": (re.compile(r"[0-9A-F]+"), "byte array in hex format"),
    "B": (
        re.compile(r"[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+"),
        "array of integers or floats",
    ),
}


def _tags_to_annotations(tags):
    """Build an annotations dictionary from a list of tags (PRIVATE).

    Arguments:
     - tags - list of optional fields, each a ``TAG:TYPE:VALUE`` string.

    Returns a dictionary mapping each tag name to a ``(type, value)`` tuple of
    strings. If a tag name occurs more than once, the last occurrence is kept.

    Raises ValueError if a tag has fewer than three colon separated parts.
    Issues a BiopythonWarning for an invalid tag name, an unknown tag type, or
    a value which does not match the format required by its type; the tag is
    stored in the dictionary regardless.
    """
    annotations = {}
    for tag in tags:
        # Split at most twice, as the tag value may contain : characters
        parts = tag.split(":", 2)
        if len(parts) < 3:
            raise ValueError(f"Segment line has invalid tag: {tag}.")
        name, tag_type, value = parts
        if _TAG_NAME.fullmatch(name) is None:
            warnings.warn(
                f"Tag has invalid name: {name}. Are they tab delimited?",
                BiopythonWarning,
            )
        annotations[name] = (tag_type, value)

        # Look the type up as a whole key. A substring test against "AifZJHB"
        # would wrongly accept an empty type, or a multi-letter one like "if".
        try:
            pattern, description = _TAG_VALUE_FORMATS[tag_type]
        except KeyError:
            warnings.warn(f"Tag has invalid type: {tag_type}", BiopythonWarning)
            continue
        if pattern.fullmatch(value) is None:
            warnings.warn(
                f"Tag has incorrect type. Expected {description}, got {value}.",
                BiopythonWarning,
            )
    return annotations
|
||||||
|
|
||||||
|
|
||||||
|
def Gfa1Iterator(source):
    """Iterate over the segment lines of a GFA 1.x file as SeqRecord objects.

    Only lines whose first tab separated field is ``S`` give a record; all
    other lines are skipped. A line consisting of just a newline triggers a
    BiopythonWarning. A segment line with fewer than three fields raises a
    ValueError.

    The segment name is used as both the record id and name, and the optional
    fields become the record's annotations. A sequence given as ``*`` starts
    as an undefined sequence of length zero before the tags are checked.

    Documentation: https://gfa-spec.github.io/GFA-spec/GFA1.html
    """
    with as_handle(source) as stream:
        for raw in stream:
            if raw == "\n":
                warnings.warn("GFA data has a blank line.", BiopythonWarning)
                continue

            columns = raw.strip("\n").split("\t")
            if columns[0] != "S":
                # Header, link, containment, path, ... lines are not parsed.
                continue
            if len(columns) < 3:
                raise ValueError(
                    f"Segment line must have name and sequence fields: {raw}."
                )

            identifier, letters, *optional = columns[1:]
            seq = Seq(None, length=0) if letters == "*" else Seq(letters)

            # Validate first (this may set the length of an undefined
            # sequence), then convert the tags into annotations.
            _check_tags(seq, optional)
            annotations = _tags_to_annotations(optional)

            yield SeqRecord(
                seq, id=identifier, name=identifier, annotations=annotations
            )
|
||||||
|
|
||||||
|
|
||||||
|
def Gfa2Iterator(source):
    """Parser for GFA 2.0 files.

    Yields one SeqRecord per segment (``S``) line; all other lines are
    skipped. A line consisting of just a newline triggers a BiopythonWarning.

    The segment name is used as both the record id and name, and the optional
    fields become the record's annotations. If the sequence is given as ``*``
    the record gets an undefined sequence whose length is taken from the
    segment line's length field. If the sequence is given, the length field
    is only checked to be an integer and is not compared with the sequence.

    Raises ValueError if a segment line has fewer than four fields or if its
    length field is not an integer.

    Documentation for version 2: https://gfa-spec.github.io/GFA-spec/GFA2.html
    """
    with as_handle(source) as handle:
        for line in handle:
            if line == "\n":
                warnings.warn("GFA data has a blank line.", BiopythonWarning)
                continue

            fields = line.strip("\n").split("\t")
            if fields[0] != "S":
                continue
            if len(fields) < 4:
                raise ValueError(
                    f"Segment line must have name, length, and sequence fields: {line}."
                )
            try:
                length = int(fields[2])
            except ValueError:
                raise ValueError(
                    f"Segment line must have an integer length: {line}."
                ) from None

            if fields[3] == "*":
                # No sequence data: use the stated segment length rather than
                # discarding it. A negative value is treated as zero, which is
                # the length used when no length information is available.
                seq = Seq(None, length=max(length, 0))
            else:
                seq = Seq(fields[3])

            tags = fields[4:]
            _check_tags(seq, tags)
            annotations = _tags_to_annotations(tags)

            yield SeqRecord(seq, id=fields[1], name=fields[1], annotations=annotations)
|
@ -291,6 +291,10 @@ names are also used in Bio.AlignIO and include the following:
|
|||||||
- gck - Gene Construction Kit's format.
|
- gck - Gene Construction Kit's format.
|
||||||
- genbank - The GenBank or GenPept flat file format.
|
- genbank - The GenBank or GenPept flat file format.
|
||||||
- gb - An alias for "genbank", for consistency with NCBI Entrez Utilities
|
- gb - An alias for "genbank", for consistency with NCBI Entrez Utilities
|
||||||
|
- gfa1 - Graphical Fragment Assembly versions 1.x. Only segment lines
|
||||||
|
are parsed and all linkage information is ignored.
|
||||||
|
- gfa2 - Graphical Fragment Assembly version 2.0. Only segment lines are
|
||||||
|
parsed and all linkage information is ignored.
|
||||||
- ig - The IntelliGenetics file format, apparently the same as the
|
- ig - The IntelliGenetics file format, apparently the same as the
|
||||||
MASE alignment format.
|
MASE alignment format.
|
||||||
- imgt - An EMBL like format from IMGT where the feature tables are more
|
- imgt - An EMBL like format from IMGT where the feature tables are more
|
||||||
@ -380,6 +384,7 @@ from Bio.SeqIO import AbiIO
|
|||||||
from Bio.SeqIO import AceIO
|
from Bio.SeqIO import AceIO
|
||||||
from Bio.SeqIO import FastaIO
|
from Bio.SeqIO import FastaIO
|
||||||
from Bio.SeqIO import GckIO
|
from Bio.SeqIO import GckIO
|
||||||
|
from Bio.SeqIO import GfaIO
|
||||||
from Bio.SeqIO import IgIO # IntelliGenetics or MASE format
|
from Bio.SeqIO import IgIO # IntelliGenetics or MASE format
|
||||||
from Bio.SeqIO import InsdcIO # EMBL and GenBank
|
from Bio.SeqIO import InsdcIO # EMBL and GenBank
|
||||||
from Bio.SeqIO import NibIO
|
from Bio.SeqIO import NibIO
|
||||||
@ -420,6 +425,8 @@ _FormatToIterator = {
|
|||||||
"gck": GckIO.GckIterator,
|
"gck": GckIO.GckIterator,
|
||||||
"genbank": InsdcIO.GenBankIterator,
|
"genbank": InsdcIO.GenBankIterator,
|
||||||
"genbank-cds": InsdcIO.GenBankCdsFeatureIterator,
|
"genbank-cds": InsdcIO.GenBankCdsFeatureIterator,
|
||||||
|
"gfa1": GfaIO.Gfa1Iterator,
|
||||||
|
"gfa2": GfaIO.Gfa2Iterator,
|
||||||
"imgt": InsdcIO.ImgtIterator,
|
"imgt": InsdcIO.ImgtIterator,
|
||||||
"nib": NibIO.NibIterator,
|
"nib": NibIO.NibIterator,
|
||||||
"cif-seqres": PdbIO.CifSeqresIterator,
|
"cif-seqres": PdbIO.CifSeqresIterator,
|
||||||
|
6
NEWS.rst
6
NEWS.rst
@ -63,11 +63,17 @@ The wwPDB organization announced that they plan to deprecate the FTP
|
|||||||
server by the end of the year. See the announcement
|
server by the end of the year. See the announcement
|
||||||
`here <https://www.wwpdb.org/news/news?year=2023#65562f0ad78e004e766a96c1>`_.
|
`here <https://www.wwpdb.org/news/news?year=2023#65562f0ad78e004e766a96c1>`_.
|
||||||
|
|
||||||
|
``Bio.SeqIO`` now supports reading sequences from the Graphical Fragment
|
||||||
|
Assembly (GFA) files with the formats ``gfa1`` and ``gfa2`` (for GFA 1.x and
|
||||||
|
GFA 2.0 files respectively). All data outside of segment lines are ignored, such
|
||||||
|
as linkage information.
|
||||||
|
|
||||||
Many thanks to the Biopython developers and community for making this release
|
Many thanks to the Biopython developers and community for making this release
|
||||||
possible, especially the following contributors:
|
possible, especially the following contributors:
|
||||||
|
|
||||||
- Anil Tuncel (first contribution)
|
- Anil Tuncel (first contribution)
|
||||||
- Fabio Zanini (first contribution)
|
- Fabio Zanini (first contribution)
|
||||||
|
- Michael M. (first contribution)
|
||||||
- Michiel de Hoon
|
- Michiel de Hoon
|
||||||
- Peter Cock
|
- Peter Cock
|
||||||
- Will Tyler (first contribution)
|
- Will Tyler (first contribution)
|
||||||
|
1
Tests/GFA/corrupt_checksum.gfa
Normal file
1
Tests/GFA/corrupt_checksum.gfa
Normal file
@ -0,0 +1 @@
|
|||||||
|
S fake AAA SH:H:FFFFFFFFFFFFFFF69566510EE712661F9F14B83385006EF92AEC47F523A38358
|
1
Tests/GFA/corrupt_len.gfa
Normal file
1
Tests/GFA/corrupt_len.gfa
Normal file
@ -0,0 +1 @@
|
|||||||
|
S fake AAA LN:i:2
|
1
Tests/GFA/corrupt_segment_fields.gfa
Normal file
1
Tests/GFA/corrupt_segment_fields.gfa
Normal file
@ -0,0 +1 @@
|
|||||||
|
S fake
|
1
Tests/GFA/corrupt_tag_name.gfa
Normal file
1
Tests/GFA/corrupt_tag_name.gfa
Normal file
@ -0,0 +1 @@
|
|||||||
|
S fake AAA ~~:i:0
|
1
Tests/GFA/corrupt_tag_type.gfa
Normal file
1
Tests/GFA/corrupt_tag_type.gfa
Normal file
@ -0,0 +1 @@
|
|||||||
|
S fake AAA AB:i:C
|
1
Tests/GFA/fake_gfa2.gfa
Normal file
1
Tests/GFA/fake_gfa2.gfa
Normal file
@ -0,0 +1 @@
|
|||||||
|
S fake 3 AAA
|
1
Tests/GFA/fake_with_checksum.gfa
Normal file
1
Tests/GFA/fake_with_checksum.gfa
Normal file
@ -0,0 +1 @@
|
|||||||
|
S fake AAA SH:H:CB1AD2119D8FAFB69566510EE712661F9F14B83385006EF92AEC47F523A38358
|
21
Tests/GFA/no_seq.gfa
Normal file
21
Tests/GFA/no_seq.gfa
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
S 232 * LN:i:528 KC:i:51170
|
||||||
|
S 277 * LN:i:893 KC:i:50561
|
||||||
|
S 280 * LN:i:895 KC:i:37634
|
||||||
|
S 282 * LN:i:1819 KC:i:106341
|
||||||
|
S 283 * LN:i:1854 KC:i:82138
|
||||||
|
S 289 * LN:i:163 KC:i:14624
|
||||||
|
S 297 * LN:i:4149 KC:i:248525
|
||||||
|
S 333 * LN:i:4399 KC:i:191002
|
||||||
|
S 6 * LN:i:89 KC:i:9779
|
||||||
|
L 6 + 277 - 81M
|
||||||
|
L 6 + 280 + 81M
|
||||||
|
L 232 + 277 + 81M
|
||||||
|
L 280 + 232 - 81M
|
||||||
|
L 282 + 6 + 81M
|
||||||
|
L 283 + 6 + 81M
|
||||||
|
L 289 + 282 + 81M
|
||||||
|
L 289 + 283 + 81M
|
||||||
|
L 297 - 232 + 81M
|
||||||
|
L 297 + 289 + 81M
|
||||||
|
L 333 - 232 + 81M
|
||||||
|
L 333 + 289 + 81M
|
19
Tests/GFA/seq.gfa
Normal file
19
Tests/GFA/seq.gfa
Normal file
File diff suppressed because one or more lines are too long
21
Tests/GFA/seq_with_len.gfa
Normal file
21
Tests/GFA/seq_with_len.gfa
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
S 232 CCTTATACGAAGCCCAGGTTAATCCTGGGCTTTTTGTTGAATCTGATCATTGGTAGCAAACAGATCAGGATTGGTAATTTTGATGTTTTCCTGACAACTCCTGCAAAGCATCAGCCCAGCAAAAAGTTGTACATGTTCCGTTGATTCACAGAAGGCACATGGCTTAGGAAAAGAGATGATATTGGCTGAATTGACTAATTCTGTTGACATAAGAAGTAACCTTGGATTGTACATATTTATTTTTAATAAATTTCAATACTTTATACCTAATTTAGCCACTAAAATTTGTACATTATTGTATAATCCATGTGTACATATTATTTTTTATAATTTTCATTCACTTAGACCCAAAATAGTATTTATTTTTGTACAACACCATGTACAGAACAATAATCATATAAATCAAATTTTTAGCATAAAAAAGTCCATTAATTTTGTACACAATTCTGAAACTTAAAAATCTAAACTTTCATCAATTTTTTTCATAATTTCAATAAATTAACCTTAATTTTAAGATATATTCTGAAA LN:i:528 RC:i:51170
|
||||||
|
S 277 TGAAACTTAAAAATCTAAACTTTCATCAATTTTTTTCATAATTTCAATAAATTAACCTTAATTTTAAGATATATTCTGAAATTTGGTTTGAAAGCTGTTTTTACATTATATTTCAATACTTTAAATCAAAAAATTGGATATTTTTTGAAAAACTCAATGAAAGTTTATTTTTTATTTAAAAACAACTAGTTATATTAGTTTTTATCCATTTTTTTGAAACAGTTTCTATATGATAAAAAAACCTATAAAAACCATATCTAGCAAAGGTTTGAGGGTTATCATCTTTAGATGCGTGGTGTGTGACAAAAAAATCCCGGCATGTGCCGGATTCTGGATTAGAAAATTGGCTAAAGTGACGTAGGACTGGTGCTTGGTTTTACATGGAAAAAAGTATTTATTTTCTGGTTTATAAAAACGTAAAAAGATCAGTTTTTGTTCATTCATCCAGGTTAAAAATTTCAACCTAAAACTTTAATTATGAAAAGCTTCACAGAAAGCATTCAAATGCGATTTAAGAGCCTTTATCTAAAAAACATAGATCTTATAGCGAAAAACAGAAAACAGCTCAAAAAACGCAAAAGAGAGTGAAGTAAAGAGATGTTTTGACTTTAGATAGCTGCATAGAGCGAGTGTCTACGAGCGAACTATCAAAATTTGCGTCTAGACTCTCTGAAAAACATTTTTTTGCCCTCTTTAGCCTAAGAAAGCTTAATTTTCATGCAGAAATTTGCTCCTGGACCGAGCGTAGCGAGAAAAAAAGCTCATGAGCGAAGCGAATTCCGAGTTGCTTTTGCTTTTTCTTAAAGTCACGCAAGTATTAACCAAAAAATTGCCCCGACGAACTGAGCGAAAGCGAAGTTCAATAGAGTTTGAGCGAAGCGAAAACCAAGGGC LN:i:893 RC:i:50561
|
||||||
|
S 280 GCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGTTAATACTTACGTGATTTTAAGAAAAAGCAAAAACAACTCGGAATTCGCTTCGCTCATAAAACTTTTTTACTCGCTACGCTCGGTCTAGGAACAAATTTCTGCATAAAACTTACGCTTTCTTAGGCTAAAAAGCCCCAAAAACTGTTTTTCAGAAAGGCTGAGAGGCAAATTTTATAAGTTCGCTCGTAGACACTCGCTCTATGAATCTATCTAGAATCAAAATCCCTCTTTATTTCGCGCTCATTTGCGTTTTTTGAGCTGTTTTCTGTTTTTCACTATAAGACCTATGTTTTTTACTTAAAGGCTCTTAAATCGCATTTGAATGCTTTCTGTGAAAATTTCCATAATTAAAATTTTAGTTTGAAATTTTTAATTTGGATGAACGGACAAAAAATGATTTTTTTACGTTTTTCTAAACCAAGAAATAAATACTTTTTTCTATATAAAACCAAGCACCAGTCCTACGTCACTTTAGCCAATTTTCTAATCAAGGATCCGGCACATGCCGGGATTTTTTTGTCACACACCACGCATCTAAAGATGATAACCCTCAAACCTTTGCTAGTTATGGTTTTTATAGGTTTTTTTATCATATAGAAACTGTTTCAAAAAAATGGATAAAAAATAATATAACTAGTTGTTTTTAAATAAAAAATAAACTCTCATTGAGTTTTTCAAAAAATATCCAATTTTTTGATTTAAAGCATTGAAATATAATGTAAAAACAACTTTCAAACCAACTTTCAGAATATATCTTAAAATTAAGGTTAATTTATTGAAATTATGAAAAAAATTGATGAAAGTTTAGATTTTTAAGTTTCA LN:i:895 RC:i:37634
|
||||||
|
S 282 AGAAGCCACGCTTTTTAGGCTCTGGTTCAACATGCTCTGGAACAGGAATGCGCTTATTCTCTGGAGTAGTCAATCCGTCATGTTTAGGATCATCAATATTAGGTTCTGGCTGTACTTGAGCTACATTCGCTTGCGACTGTTCTTGATCGGTAAACGTAGTCATATTGGCTTTTGGTGCTTCTAGTAAGCGCTGCATGGCCTCGATCTGATCTTGATAAAACGATTCACGTTCTAGTGATTGATTTTCTCTAAACAGTGCTTGATTTAACTGTTTTTCCAGTATGTCAACTTGACGTTTTAATAGGTCAACTTCTGTCAAGTTTTGACTGTTAATTGATTGACTTTGATTGACATTAGTATCTTTCTTTTGTGGTTCCCCAAAAACTCTTATGGCTTCTGCGAAGTCAATTAATCCATCAGATCCTTTAGATAAATTTCCTTTGTTTATATGGGCATATATGGCTTGTCGTGAATATCCGTAAAGCTTAGCCAACTCTGAAACTGACAGTTTTTTCATTATGTAAACCAGTTTTCAATTTGTGTTAATACTTGACTGTCAACTTTACAATGTCAATTGACATCATTTATTGAAAGCCAACTTTTCGAAGTAATGGAGTAAGTTCTTTTAATTTCTCCGGCTGTTTTAGCATATCTGCAATCCGTACAGCAAATTGTTCATAGCTTTCGGTGCCTTGTGAAAGTTTTCCCATTTCTGGAAGCTCTGACAATTTACTCGCAAATGAATAGCGTTGTGCATCTGTTAAAGTGATCGTTATATCTTCGCTTACACCTATACTTTTAGCTGACTTCGTTACTGATTTTTTTTGTTTAAAGCTAAATGAAAAACCTGTAATTGATCTACCAGTCTTATGCTGTTCAACTTTAACAGTAATGTCGGTATGTTCATTTACTTGCTTTAATGCAATGTCTAAGACATATTTTTTAAAGTCATACATTCGTTTGTATTCAGTATCGAGTACACCTATTTTTTGTCTGAAATCAGACAAAGTTATAAGAGGCGTTTTTCCTGTACTACGCCATGCAATCAATATCTCATATAACCGAACGGCATAAGCACTTGTTAAATTGCTTATTTGTTGTATTTCATACTTTGTAAATTGTTCTTCTAGTCTTGTAATTAAAGGCACGATAGCGGGAGCAAAAATAAGTCTAACAACCGCTTCATTATCAATATAAGCGACTTCACTCACCCATCTTGACTTGTGATTAATAATGTTGCCTTTTTCACTAAGACTTTGATAACTGAATTGTCTTGCAAATAGATCATCACAAGCATCTTTTAATGCCTGATAAGCCGTGTTTCGATGTACACCAAATTGATTGATGTAGCTTTCAGCATGAACTGTTAATGGATCATTAGCATTTATTCCCTTACCCGATTCCCTTGCTTCAACGATAGCTAGAAGAATTAACCGTTGCTCGACTAAATCAAGGTTATAACTGGCATTAATTAATGCATTATCTTTAACTATTAGTTCTGTTTTCATATGAAAAGGTTATATTTTTAATTCAATAAGACGAGGCTATATCATAAACTCGCTTAATGCAAGGTAAACCTCGTTTTAATGCAAGGTAAACCTCGTTTTAATGCAAGGTAAACCTCGTTTTAATGCAAGGTAAACCTCGTTTTATAGTCTCTAAGAATTTTATAAATAAAGGCTTTCAGCATGTCTAAAAGCATTTAAAAGATTTAAAAATATTTAAAAGCTCAGGTCATGAAAAATTGCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGT LN:i:1819 RC:i:106341
|
||||||
|
S 283 AGAAGCCACGCTTTTTAGGCTCTGGTTCAACATGCTCTGGAACAGGAATGCGCTTATTCTCTGGAGTAGTCAATCCGTCATCTTTAGGTGGAACAACTATCACCTGCTCCTTCATTTCTACAATGGGAGCTGATTGAGGGAGTTCGGGTTCACTAGGCTCTTGGATCTCAGGTTTAGGTGCTTCCAGTAGACGTTGCATTGTCTCAATTTGATTTTGATAAAAGGATTCACGCTCTTTTGCATCATCAAGCTGATTTTGCAGCATGTTTATCTGTTGTCTAAGTAGTAAAACTTCTGTCGATATTGGACTGTCTATTTTTACAGATTCTTTTACATCCTGTTTTTTTGAAGGTTCACCAAAAACACGGATAGCCTCGGCTAGATCAATTTTATTATCAGAATTTTTACTTAAAATTCCTTTCTTTATGTTGTTGTAAATCGTTTGTCTATTGATATTGTAAAGCTTAGAAAGCTCAAGAACTGTAAGGCTTTTCATGTTGTAAATCTATGTCTAAACCTGTCAAAAGTTGTTGTCTATTAGACTGCCTAATAGACAAATGATCAAGCATAAATTTTAGGCAGCTTGATAGCCAACTTTTTGAAGATAAGGAAATAATTCTTGGAATTTTTGATAATCTTGGAGCATTTCTGCAATGCGTATAGCAAATTGTTGGAAACTTTCGGTGCCTTCTGAATATTGCCCCATCTCAGGTAATTCAGAAAGTTTGTTGGCAAAGAGATGACGTTGTTTATCACTCATTTTGGTGAAAAGGGCTAATGTATCCGTTCCCTTAATTACTTTGTCAGAGTTGGTTTTTTTCTGCTTAAAACTGAAAGAAAAACCGGAAATTGAACGTCCTTTTTTATGTTGTTCATATTTAGCCTTTACATCGGTTAACTCATTTACTTGCTTGAGGGCAAAATCTAAAACATACGTTTTAAAATTACTCATTGTTTTGTATTGATGAGCTTCGACACCAAGTTGTTGTCTAAAGATCGATAGATCAAAAATTGGCGTTTTACCAGCAGATCTCCATTGAATTAATAACTCGTATAACCGAATAGCATAAGAGCTAGTTAATCTCGAAACCTGCTCCAATTCATATTTAGTAAAGTTTTCTTCCAGTCGAGTTATGAGTGGAACAATGTCTTGCGTGAAACGTAGGAAAACGATCCCGGATTGAGGTTCATATCCAATTTTATCGACCCAACGACAGTGAAAGCTACGATCCTTACCAGTCTTAGGATTAATGTCATGATACGTGACATAGCGATCAAACAAGCTTTTAGATGCATCCCTAAGCACGGTATAAGCAGTGTGTTTCTCGACATTAAAGGTATTGATATAACTACTTGCATGTACTTCTAAAAGGCTATTTTCAGTTATCCCGTGCCCTGTTTCCCGAGCTTCAGCAATGGCTAATAGGATCAGCCTTTGTTCAACTGTATCTAGGGTATAGCTAGCTTGAATTAAAGCATTATCCTTTACGATTAATTCACTCATTTCTAATAATAAATTTATAATTAAACTAGGTTGTACAATAGACTATTTTAATTGAATTGTAAATCGACTTAATTTGGGGTCGATAAACCTAGTTTTAAGGTCGATAAACCTAGTTTTAAGGGTCGATAAACCTAGTTTTAAGGGTCGATAAACCTAGTTTTAAGGGTCGATAAACCTAGGTTTATTAACTCTAAAACTATTATGAATAAAGGGTTACAGCTTACTTAAAAGCATTTAAAAGATTAAAAATAATTAAAAGCCTAGGTAAAGAAAAATTGCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGT LN:i:1854 RC:i:82138
|
||||||
|
S 289 CACGTTCCACGCTTATTTATTGAGCAATGTTCCACGGTTTTATTTAGCCTAAGTTAACCATATGGAAGAAAAAAACGACTCCAGAAGCCACGCTTTTTAGGCTCTGGTTCAACATGCTCTGGAACAGGAATGCGCTTATTCTCTGGAGTAGTCAATCCGTCAT LN:i:163 RC:i:14624
|
||||||
|
S 297 AAAATTACCAATCCTGATCTGTTTGCTACCAATGATCAGATTCAACAAAAAGCCCAGGATTAACCTGGGCTTCGTATAAGGTGTATTATGTTAATTTTAGAAATTATTAAATGATTCCAAGATTTTCTAGCGACTTATAAGTAATTTCATTACGAATAGAACCAGACATTCCTTCTTTCATTTCTAAGTTGAGCGAAAAGGGGATTTTTTTTCCATTAGCTTGCTCCACCCAACCAGTCAACCAACCTACCTGTGGAGTAACATCCATTCCCCATCCACTTTTTGCATAAATCTTACTACCATTTACTTCTTTAATTAGAAGCATTTTTTTAACTTCTTCTTGAGTTTCTAATTTAAAAGGTAATCGGTTATGTGCAAGGTCATCGGCAAAATTAACTTCTTGTACTGGTGTAATTTTAAGGGGGCCAACTAACCAAAAATTATCGACCTGTGTTCCAATATTTGTATTTCCAAAATTAACCCGCTTTACTTCTTTCTGCATTAGCTCTAGGCCAGTCCGTCTTGCAAGCTCTTGATATACTGGAACTGCTGACAATGCCATTGCCTCACCTAAAGTCATATCTTTCTCCCACATAGGATAAGTTCTTTTTTTACCATCCCATTTGAAAATCTCATTTGTTGTTGCTTTATGATTTTCTAGCCCGATTAAAGCATTTAGCATCTTAAATGTTGATGCAGGGACATATTCTTTATTTGCTCGTGCAAGAGCATTACCATAGGTGCTAAGATTTTTACCCTCTTTAATAATAATTACACCCTGTGTTTGAGCTTCATCAAAATAGCTTTTAATAGCTTTTTCATGTTGCTGAGAAGAAATATGAAAATTATCTTCAGATTTAGTTTTAATAGATGAACATGCACTGAGAGAAACTAGAATAGAAATGCTGAATATAGGAAGTATAAATTTTTTCATTACAAATTCATGTTAGGGGAAATTTTGGGGCTTAGAGCATAATATTTTTATTAAAAAAGTTAAAACTCTTTTAATTTAACATAATGGGCGTTATCCGAAATGGCCCTTTAAGTTTACTTTAAGATCAACAACTTACATTGAGTTAAACCCCTGGAACGATCTCGTTCAACCGATTTAGAAATAAATCGGTTTTTTTATGTTCAATTCTGATACTTCTTGCCAATAAAATTGGTTAAAAAATGATCAAAAATACCAATTCCATAGAGTTTGTAAGCCTTTTTTACCCAAAAGAATCCACAATTTCTGCGGATAACCATTGGGATAAATTTTGATTATTTTTTGTACGTTTTGGCTTACAGCGTTTCACGAAATTGTCTGTATCTATCCCCCGATGAAGCTCTTATGATCGATCTATATGGACAGGATGTTCAATGACTTAAAACAACAATAATAATCAGTCAGGCACCATGACCATCAGGTAGTCAGCCCTACCTATGCGCTCCATACCGAAAAATCCCGACATGACGTCGGGATTTTTTATTGCCATTTGCAAAGTCATTCAATCAGATTTAATATATAGACTGTAATCTATATTGTGATTGTAATATGTGGACAGTCATTACAACGGATCTGTTTAATGAATGGCTTGAACAGCAAGATGAAGCAACACAAGAGAAAGTCTTAGCTGCATTGGTTGTTCTACAACAGCAAGGGCCTAGTTTAGGTCGTCCTTTAGTAGATACCGTCTACGATTCTAAATTTACCAATATGAAGGAACTACGTGTTCAGCATCGTGGTAAACCGTTAAGGGCATTTTTTGCATTCGATCCCCTAAGACAAGCGATTGTGCTGTGTATTGGTGATAAGAGTAATAAAAAGCGTTTTTATACAGAGATGCTAGCAATTGCAGATGAGCAATACGCACTTCATCTCTCAACTTTAGGAGATCAATCTAATGGCTAAGAGCTTACAAGAATTATTGGCTAGTCGCTCTCCAGAGAGTCAAGCTCGTATTCAAAAAATGGCTGATGAATTACTTTTAGAAAGTCAGCTTCACCTT
ATCCGAGAAGAGTTAGAAATTTCCCAGAAAGAACTAGCTGCAACCCTTGGAATTAAACAGCCGTCATTATCAGCAATCGAAAATCGTGGTCATGATCTTAAAATTTCAACGATGAAAAAGTATGTTGAAGCAATGGGTGGAAAACTTCGTATTGATGTAGAGCTTCCAACAGGAAAACACATAGGGTTCAATGTCTAATAATAAACCTTTGAAAAATAGTACTAAATTACTTGTTAACTTAGACAAGTTTATATTTTTAGTCAATGCTGCCGATAGCCTAGAAGAAATTGAAATTATTAGAGATTTATGCTGTGAATATTTTTCACATTGTAAGAGGCCAAGCTATTACATTGATATTTTTGACAATGCTTATTGGATAAAATATTACGAAATAGCTTAATCCTTCTAGATTCACATAGGAGTTTCTATGTTAAATAGGACTAAAAATATACTCTAATTTTAGAGTTTTCTTTTAGGTATGATGTAAAAACATACAAGCCTAAGAGTTTAATTTAAAGGTTAAGAATTGAATAAATTAGTAAATATTTTTATGGCTTTAGGAGTTTGCCTTTCAATGGAAACATTTGCAGAACCTGTTTTTTCTAATGATTTATTAGCGAAAGCAGAAAGTGGAGATACTTCTGCCCAGTTAGAATTAGCTGAGATTTATCTATATGGTCATGGTGTTGATTCAGATGAAAATCAAGCTGAAATTTGGGCTATTAAATCAGCTGAAAATGGAAATGTAGCAGCAATGTTTTGGTTAGCTGATGGATATGTTACCTATGCTAGATTAATAGAGGATGATGACAAAAACGATTCTTTAGAACATTTCCAAAAAGCTTTTAAGTGGTTTCAGAAAGCTTCAGAAAATGGTCATTCTGAATCAATGGTTGAGTTAGCTGACCTATATACTCGCGCAGCTAGCGGAATAGAAGTTAATATTAAGAAAGCTTTAGAACTTCGTGAAAAGGCTGCAAAGTTAGGTAATAAGAAAGCAATGCGAAGTCTTTCCGTTATGTATCGTGATGGTATAGGCATTCCTAAAAATACTGATTTAGCTCAAAGTTGGTGGGATAAGTCTGAAAATTAATTTTAGTAAAAGTTAAGGATTTTTTAAGAGTCTAATAAATGAATAGTGTTCATGAAATTATAGAAAAAATACATAATGAATGGGAGATTGAACCTAAGAAAGCTATTCAGAGAGGTATGGAATGTCCCTTTCCTTTGCAGTGTTCATTAAACCTAAAAAGCAAAATTTACCCACAAATACCCCAAGTACTTTTACCAAAAGTATTAGAGGATTTTTATACAGTATCTAATGGTGCAGATCTATTTAAAGATCAAGAATACGGACAATGGGGCTTAAAATTATATTCTATTGAGGAAGTAATTTTTGCTTCAAAAATTTATAAAAGTAATCGTAATAATGACGCTCTTCAAAGTGACTTAATTATTGGAGAATTCTATGGAGATTCAGACTTATTGTTAGTTCGATGTGATCCTAATAGTGATGATTACGGCTCTATTTTTGTTGTCTTACCTATAGATCAAAGACAAGATTGGTACATTGTTGCTAATACTTTTCAAGAATTCATTAGTAAATTTTATGAAACTCAGGGTGATAAATTCTGGGAACCTTAAAATTATATGGCCGATATAAGCTCTATTTAACTTATGTTCACCTAAAATTTATTTAATTATCACAAAAATAGTATCATTTTTTGAATTTCATATAACGCCCATTATGTTAAATGGCTTAATCAAAATAACCTAAAATTGCTAAAGTAGGTATAAAAAAGATCGTAAATACAGCAAAATATTGAATCATCTTTTTATGTCTGAAATCTCGTTTAGAAATGATCAAAAGCAATGCCGAAAAAAAAATGACTATGAATGTCCATGAAATTGAATACATACTTAAATCCTAATTAATTATGTTGAGTTTTAAATTTCTCTAAAATTAACATAATACACCTTATACGAAATGCTTTTAATTT
CTCTTGAAAATTCAACGCTGTAGTTTCTAGTTTATAGCCCATCTTCCCCAAGGTGGGTTTTTTTTATGATTTTTCACGTTCCACGCTTATTTATTGAGCAATGTTCCACGGTTTTATTTAGCCTAAGTTAACCATATGGAAGAAAAAAACGACTC LN:i:4149 RC:i:248525
|
||||||
|
S 333 AAAATTACCAATCCTGATCTGTTTGCTACCAATGATCAGATTCAACAAAAAGCCCAGGATTAACCTGGGCTTCGTATAAGGCGTATTATGTTAATTTTAATGGAACTTTATAATTAGACGTTGTTGTTGATATATAGCGAATGCCATCATGGTGGGCACTATATTTTATGTAGATTGAATACCCACAATCTAAATTACACCATTTTTCAGAGTACAGACTTTCTCTTACATCATCTGGGCTTCCCAATAGATCATGAATTTCAACCTTAGACATGTCAAAATTAAGTCCTTTAGGTAGAAGCTCTCTATATTTATTAAATCCTTCAACATTTTGATTATAAAAAAATATTGTTTGAAAAAATAAGTTTTTACCTAACATTGCTGTATTACTAAGATGATCTAAGCAACTAGCTTCATCTCGAAAAAGTAAACAAATACCTTTTTCTTCATTAGCTATATATAAAGTATCTTCATCTAAAGGTTCATTATCCTCATCTAATAAAGGAATATAAAGATTAGGTTCTGTAATTAATTGATTATTTAATAAAAAATAAATCAACTCCTTATCATGATTACTTGTACCCAAAAATTCTAAAAGCTTTTCGGTCATAATATTTTTCTATATTTTCGACTGACAAATTTTAGAAGAAATCCCATGTAAAAGTCATCTACATCGACCCATTTAACATAATAGAGCTTATGCGAAACCACTCTTAATTATTTTAATTAAATCAATTGTTTGTGAATTTTACATAACGCAAAAAAACACCAAAAAGCCGACTTAGAAACAAGTCGGCTGCTTTTTAAGCAAGTTTGTCATATTCTGCCAATAAAATCGGTTAAAAAATGCTCAAAATTTCAATTTTTCCTGATTTCTAAAGGCTCTCTTTTCTGAAATTATCCCCATTTCCTGCGGACAATTCTTGGGATAATTTGAGGATATTTCCTTGTACGTTCTGGCGTACAACAATTCAGGAAAATTGCCCCAATTGGTTGTGCGGTAGATTTTGTGTGTAGGCTTCTTCATTTGAAAATTATATCGCTGAAAAAGCCTTTACAGATAGGTTTGTGCAACAAAGCCTTTCATAAATAAAACTAAAATTTAATGTAAAAGCGTTGAGAATATCCTGCACCATTTTCCCATTTATCAGTCTCTATGTCTAAAGATAGGTTCTTTACTGTTGTCTGTTTATTCGTTTTGTTTTTTATAGCATTTTTTAAAACAGATGGGAGTTCTTGATTAGCAACTTCCTTGTATATGCTTTCAGACATTTTTATAAATTCAGTCTCTGCTGATTTCTTATATTTTGAGTCATTAACATTTAAAACAACTTTCAATTCTTTAATACTACTTTCAGTACCTTCTACATAATAAGCAAGATTATTTGGATTTTCTCCATTTTGAGGATCAGGAGAAATATCAATATATGAAGAAATACAACGATAGGTACCTTCTCCCATTGTGTCTTTCCAGCTAGTCATACCTGGTAAATTCAGCATAGAACATGCTTCACTCACCGATATACTAATGTCTTCTGCTTCTACGTTTCCTGTATCTTTACTTTCTGCTTCTACATTTCCTATATCTTTACTTTCTACTTCCACGCTTTTTTCCTCTACTTCTTTATTCTGAATCGAAGAATCACAAGCTGAAAAAAATAAAGGTAATAACATCACAGCAATAAATTTATTTCTCATAACAATCCTATAAGCTACTAATATTAATTAATTTTTAACTGTAATTTTCCAGTTACCACCAACATCTGCTAGTTCGTATCTATAATTGTAATTATTACACACTAAATGATATCCAACACTTCTAACAAAAGGTAAAGTTGATGTTACAGAATCACACTTATAACCATTTAATTTAACCATGACTGCCATTTGCTTAGCCCCTTGTACTCCTGTAGCTTTATCAATTTTGGCACCCGATTCAATTTTTACTGATTTCATCATCTCTTGTAATACTTTTTCTTGTAATGGAGTTGCAA
AAGTATTTACTGAGCATAATGTAGAGATAGCTAATACAATTTTGATAGATAATTTCATGATTATTAAGTAATTAAAAAAATGATATATTATATATAGTTAAAAATTTAGCAATATTTATATTGGTAAGTCCTTTATGGTTCTAGGCGGTTGGCGGGTTTCGGGGGGCTTGTCCCCCTGAACAAGGTCACGATAGGTAACTTTGAAAATAACGCAGTTATTTGAAAAAGTACCTATTGTGTATCCTTGCTTATTTTTGAATAAGCACTAGTTTAATATAATATTTAAATAAATTCCTGTGTAGGTTTGTTTTTGGGTTTTAGTTAGTGAACTATTTTTTATAATTGAACAAGAAAAAGGAAATATAAAAAATATAGCTCAGGTATTTGAATATCACCATTGAAAAACAAGGCTTCACGGTATGGAGCCGCTGTCGCTTTCAAATCGGTATGCTTGGGTTGCGTTACAGTCCAAGTCACAAGAAAGTATTTAGAAGTCTGACTACTTCCCCTGCACACGCATTATCTCTCTACAACACATGCTCGTATCTTGTGCCGATCATCTCGTACATCTATTTACGATTACATGCGCTATTCCCTGCGTCCTTTACCTAGTGTAATACATCAGTGGTAAAGCGGTTTTAGTCTTTAATGCCTGCCGTCGCTTGTCCACCTTCACATTAAAGCCACCAAAGACTAAGCCGATATATAAAAATATCGCAGTACCCATTGATTCGCATAGTCTCTATTTGCCGCCACCTCATTCCCAGCAAAATCATTTTTAATTTTATTTGTTGTCACTGCTTTTTTAGGAGAACTTTCTACTATCTTTTCCCAGTCAAATAGTGCCTCACCACTGTCTATCCATGTAGTAAAATCATCATAAATCATAATATTTTTTATGGCTTCTTCTTTATTTGCAAAAGTAAATGATGATGTAGTTAATAATAATGTCGGGATCAGAAGGTATTTCATAAAAAGATACTCTAAAATTAATTTTAAGTTAGAATCTTATAAAACAATGACTTAATTAAAAGCAATTTTTTTGCTTTAGGGAATAGTGTATGTCTTTAATAAATTGTCCAGAATGCAACCATTCTGTTAGTGATAAAGCCTTAGATTGTCCATCTTGTGGGGCTAGATTAAGGAAACCTAAAAGAGGTTTCTTTGGTAAGATTTTTAAATGGCTCTTTATTCTTTTTAACCTTTTTATGCTAGTTATGACATTTAAATCTTGTGGAAGTGCTTCAGAAATTATTGCTTCTAGCCAAAATGAATATGAACAAGCAGGAGCTACATTAGGCTCTACATTAGGCTTAGGGATGGTGATAATTTTCTGGGCATTAGTAGATGTAATACTTGGATTATTTGTACTATTCACACGACCAAAGCGATAAAAATAGGGAGATATTCTCCTTATTTTTCAAATAGCTAAGCGTAAATTAAGGGCTTTAACAACTTTTATAACAGTGTCAAAACTAGGATTAACATCCCCAGATAGAGCTTTGTAAAGACTTTCTCTTCCTAAGCCAGTTTCTTTTGATAGTTGAGTCATACCTTTAGCTTTAGCAATGTTTCCTAGTGCTTTAGCGATAAAGGCTGCATCACCGTTGGATTCATCAATACATGCTTGTAAGTAAGCTTGCATATCCTCTTCTGTTTTGAGGTGCTCTGCACTATCCCACTTACGAAGTTTAATAGCCATTTATAGTTCCTCCTCTAGGTCTTTTGCTAGCTTCAAGGCGAGTTTTATATCTTTACTTTGGGTGGTTTTATCACCGCCTGCTAATAAAATAATAACTCTTTGTCCTTGTTGGCAATAATAAACTCTATAACCAGGACCGAAGAAAAAACGTAGTTCAGAAACACCTTCGCCCACAGGCTCAGTATCTCCAAAATTGCCATCTTCAACCCGATCGATACGAACTTGTATGCGTCTTTTTGCCTGTTGGTCTTTAAGTTTGTCAAACCAATCATCAAAGACTTCGGTGGTGTATATTGAG
TACATAAGCAAAATGTATCCTATAGGATACATTTGTGCAAGACATAAACTTGGTTTTTTAGGCCTGAGGCACTGATGTTATTCCCAAGTAGAAAAGTCCTACCTAATAACTAAATAGAAATGTCCTATACAGGCATTTCCGGCCAGTGAGCCTATCAAAGTCCTCATGCATGAAGCCAAAGAAGAAATAATTTATTATTGAGTTTCGCTAAAATTAACATAATACACCTTATACGAAATGCGTTATAATTATCTATAGAAATCAATGCTGTAGTTTCTAGTTTATAGCCCATCTTCCCCAAGGTGGGTTTTTTTTATGATTTTACACGTTCCACGCTTATTTATTGAGCAATGTTCCACGGTTTTATTTAGCCTAAGTTAACCATATGGAAGAAAAAAACGACTC LN:i:4399 RC:i:191002
|
||||||
|
S 6 GAAAAATTGCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGTTAATACTT LN:i:89 RC:i:9779
|
||||||
|
L 6 + 277 - 81M
|
||||||
|
L 6 + 280 + 81M
|
||||||
|
L 232 + 277 + 81M
|
||||||
|
L 280 + 232 - 81M
|
||||||
|
L 282 + 6 + 81M
|
||||||
|
L 283 + 6 + 81M
|
||||||
|
L 289 + 282 + 81M
|
||||||
|
L 289 + 283 + 81M
|
||||||
|
L 297 - 232 + 81M
|
||||||
|
L 297 + 289 + 81M
|
||||||
|
L 333 - 232 + 81M
|
||||||
|
L 333 + 289 + 81M
|
@ -28,7 +28,7 @@ from Bio.SeqRecord import SeqRecord
|
|||||||
# warnings to stdout and verifying via the print-and-compare check. However,
|
# warnings to stdout and verifying via the print-and-compare check. However,
|
||||||
# there was some frustrating cross-platform inconsistency I couldn't resolve.
|
# there was some frustrating cross-platform inconsistency I couldn't resolve.
|
||||||
|
|
||||||
possible_unknown_seq_formats = {"embl", "genbank", "gb", "imgt", "qual"}
|
possible_unknown_seq_formats = {"embl", "genbank", "gb", "imgt", "qual", "gfa1", "gfa2"}
|
||||||
|
|
||||||
# List of formats including alignment only file formats we can read AND write.
|
# List of formats including alignment only file formats we can read AND write.
|
||||||
# The list is initially hard coded to preserve the original order of the unit
|
# The list is initially hard coded to preserve the original order of the unit
|
||||||
@ -1844,6 +1844,61 @@ class TestSeqIO(SeqIOTestBaseClass):
|
|||||||
messages,
|
messages,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_gfa1(self):
    """Run the shared SeqIO checks on the eight segments of GFA/seq.gfa."""
    ids = [
        "MTh0",
        "MTh4001",
        "MTo3426",
        "MTh4502",
        "MTo8961",
        "MTh9505",
        "MTh13014",
        "MTh13516",
    ]
    # The parser uses the segment name for both the id and the name.
    names = ids
    sequences = [
        "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCT...ACATTAT",
        "TATAATAAACACCCTCACCACTACAATCTTCCTAGGAACA...CTACTCT",
        "GGGGTAAATGATGGGTTGGGCCAAGGGGTTAATTAGTACG...TATTAAG",
        "ACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGC...TCTGAGC",
        "ATTCTACCACTCCAGCCTAGCCCCCACCCCTCAACTTGGA...TATAAAC",
        "CTTTTACCACTCCAGCCTAGCCCCTACCCCCCAATTAGGA...AGCCCAA",
        "TTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGAAGGCC...AAAGACC",
        "ACATCATCGAAACCGCAAACATATCATACACAAACGCCTG...CACGATG",
    ]
    lengths = [4001, 501, 501, 5003, 502, 3509, 502, 3053]
    molecule_types = dict.fromkeys(
        ("embl", "genbank", "imgt", "seqxml", "nexus"), "DNA"
    )

    # Several output formats are expected to give the same messages.
    no_quality = "No suitable quality scores found in letter_annotations of SeqRecord (id=MTh13516)."
    too_many = "More than one sequence found"
    messages = {
        "fastq": no_quality,
        "fastq-illumina": no_quality,
        "fastq-solexa": no_quality,
        "nib": too_many,
        "phd": no_quality,
        "qual": no_quality,
        "sff": "Missing SFF flow information",
        "xdna": too_many,
    }

    self.perform_test(
        "gfa1",
        False,
        "GFA/seq.gfa",
        8,
        ids,
        names,
        sequences,
        lengths,
        None,
        messages,
        molecule_types=molecule_types,
    )
|
||||||
|
|
||||||
def test_nexus1(self):
|
def test_nexus1(self):
|
||||||
sequences = [
|
sequences = [
|
||||||
"A-C-G-Tc-gtgtgtgctct-t-t------ac-gtgtgtgctct-t-t",
|
"A-C-G-Tc-gtgtgtgctct-t-t------ac-gtgtgtgctct-t-t",
|
||||||
|
77
Tests/test_SeqIO_Gfa.py
Normal file
77
Tests/test_SeqIO_Gfa.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
"""Tests for SeqIO GFA module."""
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from Bio import BiopythonWarning
|
||||||
|
from Bio import SeqIO
|
||||||
|
|
||||||
|
|
||||||
|
class TestRead(unittest.TestCase):
    """Reading well-formed GFA files through Bio.SeqIO."""

    def test_read_GFA1(self):
        """Test parsing valid GFA 1.x files."""
        # Segments with sequences and SN/SO tags.
        parsed = list(SeqIO.parse("GFA/seq.gfa", "gfa1"))
        self.assertEqual(len(parsed), 8)
        seventh = parsed[6]
        self.assertEqual(seventh.id, "MTh13014")
        self.assertEqual(
            seventh.seq,
            "TTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGAAGGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTATAGTTGTAGCAGGAATCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCACTAATCCAAACTCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGCAGCAGTCTGCGCCCTTACACAAAATGACATCAAAAAAATCGTAGCCTTCTCCACTTCAAGTCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCATACTATTTATGTGCTCCGGGTCCATCATCCACAACCTTAACAATGAACAAGATATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTCACTTCAACCTCCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCTACTCCAAAGACC",
        )
        first_annotations = parsed[0].annotations
        self.assertEqual(first_annotations["SN"], ("Z", "MT_human"))
        self.assertEqual(first_annotations["SO"], ("i", "0"))

        # Segments with both a sequence and an LN tag.
        parsed = list(SeqIO.parse("GFA/seq_with_len.gfa", "gfa1"))
        self.assertEqual(len(parsed), 9)
        self.assertEqual(
            parsed[8].seq,
            "GAAAAATTGCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGTTAATACTT",
        )

        # A single segment with an SH checksum tag.
        parsed = list(SeqIO.parse("GFA/fake_with_checksum.gfa", "gfa1"))
        self.assertEqual(len(parsed), 1)
        self.assertEqual(parsed[0].seq, "AAA")

        # Segments without sequence data take their length from the LN tag.
        parsed = list(SeqIO.parse("GFA/no_seq.gfa", "gfa1"))
        self.assertEqual(len(parsed), 9)
        self.assertEqual(len(parsed[0]), 528)

    def test_read_GFA2(self):
        """Test parsing valid GFA 2.0 files."""
        parsed = list(SeqIO.parse("GFA/fake_gfa2.gfa", "gfa2"))
        self.assertEqual(len(parsed), 1)
        self.assertEqual(parsed[0].seq, "AAA")
|
|
||||||
|
|
||||||
|
class TestCorrupt(unittest.TestCase):
    """Malformed GFA input must raise an error or issue a warning."""

    @staticmethod
    def _read_all(filename, fmt):
        """Parse the whole file, so that every line is examined (PRIVATE)."""
        return list(SeqIO.parse(filename, fmt))

    def test_corrupt_gfa2(self):
        """Check a GFA 1.x file does not parse in GFA 2."""
        self.assertRaises(ValueError, self._read_all, "GFA/seq.gfa", "gfa2")

    def test_corrupt_segment_fields(self):
        """Check a GFA file with invalid fields on a segment line."""
        self.assertRaises(
            ValueError, self._read_all, "GFA/corrupt_segment_fields.gfa", "gfa1"
        )

    def test_corrupt_len(self):
        """Check a GFA file with an incorrect length."""
        self.assertWarns(
            BiopythonWarning, self._read_all, "GFA/corrupt_len.gfa", "gfa1"
        )

    def test_corrupt_checksum(self):
        """Check a GFA file with an incorrect checksum."""
        self.assertWarns(
            BiopythonWarning, self._read_all, "GFA/corrupt_checksum.gfa", "gfa1"
        )

    def test_corrupt_tag_name(self):
        """Check a GFA file with an invalid tag name."""
        self.assertWarns(
            BiopythonWarning, self._read_all, "GFA/corrupt_tag_name.gfa", "gfa1"
        )

    def test_corrupt_tag_type(self):
        """Check a GFA file with an incorrect tag type."""
        self.assertWarns(
            BiopythonWarning, self._read_all, "GFA/corrupt_tag_type.gfa", "gfa1"
        )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run this file's tests directly, reporting each test by name.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))
|
Reference in New Issue
Block a user