mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
Add SeqIO support for GFA files. (#4598)
* Adds tags from segment lines into the annotations dictionary of the SeqRecord, handles undefined sequences and sets their length according to LN tag, raises a warning on incorrect tag types, and adds more links to the format documentation. * Check for valid tag name and store tag types in annotations * Tag value may contain colons * Update NEWS.rst
This commit is contained in:
177
Bio/SeqIO/GfaIO.py
Normal file
177
Bio/SeqIO/GfaIO.py
Normal file
@ -0,0 +1,177 @@
|
||||
"""Bio.SeqIO support for the Graphical Fragment Assembly format.
|
||||
|
||||
This format is output by many assemblers and includes linkage information for
|
||||
how the different sequences fit together, however, we just care about the
|
||||
segment (sequence) information.
|
||||
|
||||
Documentation:
|
||||
- Version 1.x: https://gfa-spec.github.io/GFA-spec/GFA1.html
|
||||
- Version 2.0: https://gfa-spec.github.io/GFA-spec/GFA2.html
|
||||
"""
|
||||
|
||||
import warnings
|
||||
import hashlib
|
||||
import re
|
||||
|
||||
from Bio import BiopythonWarning
|
||||
from Bio.File import as_handle
|
||||
from Bio.Seq import Seq, _UndefinedSequenceData
|
||||
from Bio.SeqRecord import SeqRecord
|
||||
|
||||
|
||||
def _check_tags(seq, tags):
|
||||
"""Check a segment line's tags for inconsistencies (PRIVATE)."""
|
||||
for tag in tags:
|
||||
if tag[:2] == "LN":
|
||||
# Sequence length
|
||||
if len(seq) == 0:
|
||||
# No sequence data, set the sequence length
|
||||
seq._data = _UndefinedSequenceData(int(tag[5:]))
|
||||
elif int(tag[5:]) != len(seq):
|
||||
warnings.warn(
|
||||
f"Segment line has incorrect length. Expected {tag[5:]} but got {len(seq)}.",
|
||||
BiopythonWarning,
|
||||
)
|
||||
elif tag[:2] == "SH":
|
||||
# SHA256 checksum
|
||||
checksum = hashlib.sha256(str(seq).encode()).hexdigest()
|
||||
if checksum.upper() != tag[5:]:
|
||||
warnings.warn(
|
||||
f"Segment line has incorrect checksum. Expected {tag[5:]} but got {checksum}.",
|
||||
BiopythonWarning,
|
||||
)
|
||||
|
||||
|
||||
def _tags_to_annotations(tags):
|
||||
"""Build an annotations dictionary from a list of tags (PRIVATE)."""
|
||||
annotations = {}
|
||||
for tag in tags:
|
||||
parts = tag.split(":")
|
||||
if len(parts) < 3:
|
||||
raise ValueError(f"Segment line has invalid tag: {tag}.")
|
||||
if re.fullmatch(r"[A-Za-z][A-Za-z0-9]", parts[0]) is None:
|
||||
warnings.warn(
|
||||
f"Tag has invalid name: {parts[0]}. Are they tab delimited?",
|
||||
BiopythonWarning,
|
||||
)
|
||||
parts[2] = ":".join(parts[2:]) # tag value may contain : characters
|
||||
annotations[parts[0]] = (parts[1], parts[2])
|
||||
|
||||
# Check type of the tag and raise warning on a mismatch. These RegExs
|
||||
# are part of the 1.0 standard.
|
||||
if parts[1] not in "AifZJHB":
|
||||
warnings.warn(f"Tag has invalid type: {parts[1]}", BiopythonWarning)
|
||||
elif parts[1] == "A" and re.fullmatch(r"[!-~]", parts[2]) is None:
|
||||
warnings.warn(
|
||||
f"Tag has incorrect type. Expected printable character, got {parts[2]}.",
|
||||
BiopythonWarning,
|
||||
)
|
||||
elif parts[1] == "i" and re.fullmatch(r"[-+]?[0-9]+", parts[2]) is None:
|
||||
warnings.warn(
|
||||
f"Tag has incorrect type. Expected signed integer, got {parts[2]}.",
|
||||
BiopythonWarning,
|
||||
)
|
||||
elif (
|
||||
parts[1] == "f"
|
||||
and re.fullmatch(r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?", parts[2])
|
||||
is None
|
||||
):
|
||||
warnings.warn(
|
||||
f"Tag has incorrect type. Expected float, got {parts[2]}.",
|
||||
BiopythonWarning,
|
||||
)
|
||||
elif parts[1] == "Z" and re.fullmatch(r"[ !-~]+", parts[2]) is None:
|
||||
warnings.warn(
|
||||
f"Tag has incorrect type. Expected printable string, got {parts[2]}.",
|
||||
BiopythonWarning,
|
||||
)
|
||||
elif parts[1] == "J" and re.fullmatch(r"[ !-~]+", parts[2]) is None:
|
||||
warnings.warn(
|
||||
f"Tag has incorrect type. Expected JSON excluding new-line and tab characters, got {parts[2]}.",
|
||||
BiopythonWarning,
|
||||
)
|
||||
elif parts[1] == "H" and re.fullmatch(r"[0-9A-F]+", parts[2]) is None:
|
||||
warnings.warn(
|
||||
f"Tag has incorrect type. Expected byte array in hex format, got {parts[2]}.",
|
||||
BiopythonWarning,
|
||||
)
|
||||
elif (
|
||||
parts[1] == "B"
|
||||
and re.fullmatch(
|
||||
r"[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+", parts[2]
|
||||
)
|
||||
is None
|
||||
):
|
||||
warnings.warn(
|
||||
f"Tag has incorrect type. Expected array of integers or floats, got {parts[2]}.",
|
||||
BiopythonWarning,
|
||||
)
|
||||
return annotations
|
||||
|
||||
|
||||
def Gfa1Iterator(source):
    """Iterate over the segments of a GFA 1.x file as SeqRecord objects.

    Only segment ("S") lines are turned into records; every other line type
    is skipped. A blank line triggers a BiopythonWarning and is skipped. A
    segment line with fewer than three tab separated fields raises a
    ValueError. A sequence given as "*" becomes an undefined sequence.

    The segment name is used as both the record id and name, and the optional
    tags are stored in the record's annotations.

    Documentation: https://gfa-spec.github.io/GFA-spec/GFA1.html
    """
    with as_handle(source) as stream:
        for line in stream:
            if line == "\n":
                warnings.warn("GFA data has a blank line.", BiopythonWarning)
                continue

            columns = line.strip("\n").split("\t")
            if columns[0] != "S":
                continue
            if len(columns) < 3:
                raise ValueError(
                    f"Segment line must have name and sequence fields: {line}."
                )

            _, segment_name, bases, *tags = columns
            sequence = Seq(None, length=0) if bases == "*" else Seq(bases)

            _check_tags(sequence, tags)
            yield SeqRecord(
                sequence,
                id=segment_name,
                name=segment_name,
                annotations=_tags_to_annotations(tags),
            )
|
||||
|
||||
|
||||
def Gfa2Iterator(source):
    """Parser for GFA 2.0 files.

    Only segment ("S") lines are turned into SeqRecord objects; every other
    line type is skipped. A blank line triggers a BiopythonWarning and is
    skipped. The segment name is used as both the record id and name, and the
    optional tags are stored in the record's annotations.

    A sequence given as "*" becomes an undefined sequence whose length is
    taken from the segment's length field. When a sequence is given, the
    length field is only checked to be an integer and is not compared with
    the sequence (the GFA 2.0 documentation states that the two need not
    agree).

    Raises a ValueError if a segment line has fewer than four tab separated
    fields, if the length field is not an integer, or if a "*" segment
    declares a negative length.

    Documentation for version 2: https://gfa-spec.github.io/GFA-spec/GFA2.html
    """
    with as_handle(source) as handle:
        for line in handle:
            if line == "\n":
                warnings.warn("GFA data has a blank line.", BiopythonWarning)
                continue

            fields = line.strip("\n").split("\t")
            if fields[0] != "S":
                continue
            if len(fields) < 4:
                raise ValueError(
                    f"Segment line must have name, length, and sequence fields: {line}."
                )
            try:
                declared_length = int(fields[2])
            except ValueError:
                raise ValueError(
                    f"Segment line must have an integer length: {line}."
                ) from None

            if fields[3] == "*":
                # No sequence data: the length field is the only length
                # information available, so use it rather than zero.
                if declared_length < 0:
                    raise ValueError(
                        f"Segment line must have a non-negative length: {line}."
                    )
                seq = Seq(None, length=declared_length)
            else:
                seq = Seq(fields[3])

            tags = fields[4:]
            _check_tags(seq, tags)
            annotations = _tags_to_annotations(tags)

            yield SeqRecord(seq, id=fields[1], name=fields[1], annotations=annotations)
|
@ -291,6 +291,10 @@ names are also used in Bio.AlignIO and include the following:
|
||||
- gck - Gene Construction Kit's format.
|
||||
- genbank - The GenBank or GenPept flat file format.
|
||||
- gb - An alias for "genbank", for consistency with NCBI Entrez Utilities
|
||||
- gfa1 - Graphical Fragment Assembly versions 1.x. Only segment lines
|
||||
are parsed and all linkage information is ignored.
|
||||
- gfa2 - Graphical Fragment Assembly version 2.0. Only segment lines are
|
||||
parsed and all linkage information is ignored.
|
||||
- ig - The IntelliGenetics file format, apparently the same as the
|
||||
MASE alignment format.
|
||||
- imgt - An EMBL like format from IMGT where the feature tables are more
|
||||
@ -380,6 +384,7 @@ from Bio.SeqIO import AbiIO
|
||||
from Bio.SeqIO import AceIO
|
||||
from Bio.SeqIO import FastaIO
|
||||
from Bio.SeqIO import GckIO
|
||||
from Bio.SeqIO import GfaIO
|
||||
from Bio.SeqIO import IgIO # IntelliGenetics or MASE format
|
||||
from Bio.SeqIO import InsdcIO # EMBL and GenBank
|
||||
from Bio.SeqIO import NibIO
|
||||
@ -420,6 +425,8 @@ _FormatToIterator = {
|
||||
"gck": GckIO.GckIterator,
|
||||
"genbank": InsdcIO.GenBankIterator,
|
||||
"genbank-cds": InsdcIO.GenBankCdsFeatureIterator,
|
||||
"gfa1": GfaIO.Gfa1Iterator,
|
||||
"gfa2": GfaIO.Gfa2Iterator,
|
||||
"imgt": InsdcIO.ImgtIterator,
|
||||
"nib": NibIO.NibIterator,
|
||||
"cif-seqres": PdbIO.CifSeqresIterator,
|
||||
|
6
NEWS.rst
6
NEWS.rst
@ -63,11 +63,17 @@ The wwPDB organization announced that they plan to deprecate the FTP
|
||||
server by the end of the year. See the announcement
|
||||
`here <https://www.wwpdb.org/news/news?year=2023#65562f0ad78e004e766a96c1>`_.
|
||||
|
||||
``Bio.SeqIO`` now supports reading sequences from the Graphical Fragment
|
||||
Assembly (GFA) files with the formats ``gfa1`` and ``gfa2`` (for GFA 1.x and
|
||||
GFA 2.0 files respectively). All data outside of segment lines are ignored, such
|
||||
as linkage information.
|
||||
|
||||
Many thanks to the Biopython developers and community for making this release
|
||||
possible, especially the following contributors:
|
||||
|
||||
- Anil Tuncel (first contribution)
|
||||
- Fabio Zanini (first contribution)
|
||||
- Michael M. (first contribution)
|
||||
- Michiel de Hoon
|
||||
- Peter Cock
|
||||
- Will Tyler (first contribution)
|
||||
|
1
Tests/GFA/corrupt_checksum.gfa
Normal file
1
Tests/GFA/corrupt_checksum.gfa
Normal file
@ -0,0 +1 @@
|
||||
S fake AAA SH:H:FFFFFFFFFFFFFFF69566510EE712661F9F14B83385006EF92AEC47F523A38358
|
1
Tests/GFA/corrupt_len.gfa
Normal file
1
Tests/GFA/corrupt_len.gfa
Normal file
@ -0,0 +1 @@
|
||||
S fake AAA LN:i:2
|
1
Tests/GFA/corrupt_segment_fields.gfa
Normal file
1
Tests/GFA/corrupt_segment_fields.gfa
Normal file
@ -0,0 +1 @@
|
||||
S fake
|
1
Tests/GFA/corrupt_tag_name.gfa
Normal file
1
Tests/GFA/corrupt_tag_name.gfa
Normal file
@ -0,0 +1 @@
|
||||
S fake AAA ~~:i:0
|
1
Tests/GFA/corrupt_tag_type.gfa
Normal file
1
Tests/GFA/corrupt_tag_type.gfa
Normal file
@ -0,0 +1 @@
|
||||
S fake AAA AB:i:C
|
1
Tests/GFA/fake_gfa2.gfa
Normal file
1
Tests/GFA/fake_gfa2.gfa
Normal file
@ -0,0 +1 @@
|
||||
S fake 3 AAA
|
1
Tests/GFA/fake_with_checksum.gfa
Normal file
1
Tests/GFA/fake_with_checksum.gfa
Normal file
@ -0,0 +1 @@
|
||||
S fake AAA SH:H:CB1AD2119D8FAFB69566510EE712661F9F14B83385006EF92AEC47F523A38358
|
21
Tests/GFA/no_seq.gfa
Normal file
21
Tests/GFA/no_seq.gfa
Normal file
@ -0,0 +1,21 @@
|
||||
S 232 * LN:i:528 KC:i:51170
|
||||
S 277 * LN:i:893 KC:i:50561
|
||||
S 280 * LN:i:895 KC:i:37634
|
||||
S 282 * LN:i:1819 KC:i:106341
|
||||
S 283 * LN:i:1854 KC:i:82138
|
||||
S 289 * LN:i:163 KC:i:14624
|
||||
S 297 * LN:i:4149 KC:i:248525
|
||||
S 333 * LN:i:4399 KC:i:191002
|
||||
S 6 * LN:i:89 KC:i:9779
|
||||
L 6 + 277 - 81M
|
||||
L 6 + 280 + 81M
|
||||
L 232 + 277 + 81M
|
||||
L 280 + 232 - 81M
|
||||
L 282 + 6 + 81M
|
||||
L 283 + 6 + 81M
|
||||
L 289 + 282 + 81M
|
||||
L 289 + 283 + 81M
|
||||
L 297 - 232 + 81M
|
||||
L 297 + 289 + 81M
|
||||
L 333 - 232 + 81M
|
||||
L 333 + 289 + 81M
|
19
Tests/GFA/seq.gfa
Normal file
19
Tests/GFA/seq.gfa
Normal file
File diff suppressed because one or more lines are too long
21
Tests/GFA/seq_with_len.gfa
Normal file
21
Tests/GFA/seq_with_len.gfa
Normal file
@ -0,0 +1,21 @@
|
||||
S 232 CCTTATACGAAGCCCAGGTTAATCCTGGGCTTTTTGTTGAATCTGATCATTGGTAGCAAACAGATCAGGATTGGTAATTTTGATGTTTTCCTGACAACTCCTGCAAAGCATCAGCCCAGCAAAAAGTTGTACATGTTCCGTTGATTCACAGAAGGCACATGGCTTAGGAAAAGAGATGATATTGGCTGAATTGACTAATTCTGTTGACATAAGAAGTAACCTTGGATTGTACATATTTATTTTTAATAAATTTCAATACTTTATACCTAATTTAGCCACTAAAATTTGTACATTATTGTATAATCCATGTGTACATATTATTTTTTATAATTTTCATTCACTTAGACCCAAAATAGTATTTATTTTTGTACAACACCATGTACAGAACAATAATCATATAAATCAAATTTTTAGCATAAAAAAGTCCATTAATTTTGTACACAATTCTGAAACTTAAAAATCTAAACTTTCATCAATTTTTTTCATAATTTCAATAAATTAACCTTAATTTTAAGATATATTCTGAAA LN:i:528 RC:i:51170
|
||||
S 277 TGAAACTTAAAAATCTAAACTTTCATCAATTTTTTTCATAATTTCAATAAATTAACCTTAATTTTAAGATATATTCTGAAATTTGGTTTGAAAGCTGTTTTTACATTATATTTCAATACTTTAAATCAAAAAATTGGATATTTTTTGAAAAACTCAATGAAAGTTTATTTTTTATTTAAAAACAACTAGTTATATTAGTTTTTATCCATTTTTTTGAAACAGTTTCTATATGATAAAAAAACCTATAAAAACCATATCTAGCAAAGGTTTGAGGGTTATCATCTTTAGATGCGTGGTGTGTGACAAAAAAATCCCGGCATGTGCCGGATTCTGGATTAGAAAATTGGCTAAAGTGACGTAGGACTGGTGCTTGGTTTTACATGGAAAAAAGTATTTATTTTCTGGTTTATAAAAACGTAAAAAGATCAGTTTTTGTTCATTCATCCAGGTTAAAAATTTCAACCTAAAACTTTAATTATGAAAAGCTTCACAGAAAGCATTCAAATGCGATTTAAGAGCCTTTATCTAAAAAACATAGATCTTATAGCGAAAAACAGAAAACAGCTCAAAAAACGCAAAAGAGAGTGAAGTAAAGAGATGTTTTGACTTTAGATAGCTGCATAGAGCGAGTGTCTACGAGCGAACTATCAAAATTTGCGTCTAGACTCTCTGAAAAACATTTTTTTGCCCTCTTTAGCCTAAGAAAGCTTAATTTTCATGCAGAAATTTGCTCCTGGACCGAGCGTAGCGAGAAAAAAAGCTCATGAGCGAAGCGAATTCCGAGTTGCTTTTGCTTTTTCTTAAAGTCACGCAAGTATTAACCAAAAAATTGCCCCGACGAACTGAGCGAAAGCGAAGTTCAATAGAGTTTGAGCGAAGCGAAAACCAAGGGC LN:i:893 RC:i:50561
|
||||
S 280 GCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGTTAATACTTACGTGATTTTAAGAAAAAGCAAAAACAACTCGGAATTCGCTTCGCTCATAAAACTTTTTTACTCGCTACGCTCGGTCTAGGAACAAATTTCTGCATAAAACTTACGCTTTCTTAGGCTAAAAAGCCCCAAAAACTGTTTTTCAGAAAGGCTGAGAGGCAAATTTTATAAGTTCGCTCGTAGACACTCGCTCTATGAATCTATCTAGAATCAAAATCCCTCTTTATTTCGCGCTCATTTGCGTTTTTTGAGCTGTTTTCTGTTTTTCACTATAAGACCTATGTTTTTTACTTAAAGGCTCTTAAATCGCATTTGAATGCTTTCTGTGAAAATTTCCATAATTAAAATTTTAGTTTGAAATTTTTAATTTGGATGAACGGACAAAAAATGATTTTTTTACGTTTTTCTAAACCAAGAAATAAATACTTTTTTCTATATAAAACCAAGCACCAGTCCTACGTCACTTTAGCCAATTTTCTAATCAAGGATCCGGCACATGCCGGGATTTTTTTGTCACACACCACGCATCTAAAGATGATAACCCTCAAACCTTTGCTAGTTATGGTTTTTATAGGTTTTTTTATCATATAGAAACTGTTTCAAAAAAATGGATAAAAAATAATATAACTAGTTGTTTTTAAATAAAAAATAAACTCTCATTGAGTTTTTCAAAAAATATCCAATTTTTTGATTTAAAGCATTGAAATATAATGTAAAAACAACTTTCAAACCAACTTTCAGAATATATCTTAAAATTAAGGTTAATTTATTGAAATTATGAAAAAAATTGATGAAAGTTTAGATTTTTAAGTTTCA LN:i:895 RC:i:37634
|
||||
S 282 AGAAGCCACGCTTTTTAGGCTCTGGTTCAACATGCTCTGGAACAGGAATGCGCTTATTCTCTGGAGTAGTCAATCCGTCATGTTTAGGATCATCAATATTAGGTTCTGGCTGTACTTGAGCTACATTCGCTTGCGACTGTTCTTGATCGGTAAACGTAGTCATATTGGCTTTTGGTGCTTCTAGTAAGCGCTGCATGGCCTCGATCTGATCTTGATAAAACGATTCACGTTCTAGTGATTGATTTTCTCTAAACAGTGCTTGATTTAACTGTTTTTCCAGTATGTCAACTTGACGTTTTAATAGGTCAACTTCTGTCAAGTTTTGACTGTTAATTGATTGACTTTGATTGACATTAGTATCTTTCTTTTGTGGTTCCCCAAAAACTCTTATGGCTTCTGCGAAGTCAATTAATCCATCAGATCCTTTAGATAAATTTCCTTTGTTTATATGGGCATATATGGCTTGTCGTGAATATCCGTAAAGCTTAGCCAACTCTGAAACTGACAGTTTTTTCATTATGTAAACCAGTTTTCAATTTGTGTTAATACTTGACTGTCAACTTTACAATGTCAATTGACATCATTTATTGAAAGCCAACTTTTCGAAGTAATGGAGTAAGTTCTTTTAATTTCTCCGGCTGTTTTAGCATATCTGCAATCCGTACAGCAAATTGTTCATAGCTTTCGGTGCCTTGTGAAAGTTTTCCCATTTCTGGAAGCTCTGACAATTTACTCGCAAATGAATAGCGTTGTGCATCTGTTAAAGTGATCGTTATATCTTCGCTTACACCTATACTTTTAGCTGACTTCGTTACTGATTTTTTTTGTTTAAAGCTAAATGAAAAACCTGTAATTGATCTACCAGTCTTATGCTGTTCAACTTTAACAGTAATGTCGGTATGTTCATTTACTTGCTTTAATGCAATGTCTAAGACATATTTTTTAAAGTCATACATTCGTTTGTATTCAGTATCGAGTACACCTATTTTTTGTCTGAAATCAGACAAAGTTATAAGAGGCGTTTTTCCTGTACTACGCCATGCAATCAATATCTCATATAACCGAACGGCATAAGCACTTGTTAAATTGCTTATTTGTTGTATTTCATACTTTGTAAATTGTTCTTCTAGTCTTGTAATTAAAGGCACGATAGCGGGAGCAAAAATAAGTCTAACAACCGCTTCATTATCAATATAAGCGACTTCACTCACCCATCTTGACTTGTGATTAATAATGTTGCCTTTTTCACTAAGACTTTGATAACTGAATTGTCTTGCAAATAGATCATCACAAGCATCTTTTAATGCCTGATAAGCCGTGTTTCGATGTACACCAAATTGATTGATGTAGCTTTCAGCATGAACTGTTAATGGATCATTAGCATTTATTCCCTTACCCGATTCCCTTGCTTCAACGATAGCTAGAAGAATTAACCGTTGCTCGACTAAATCAAGGTTATAACTGGCATTAATTAATGCATTATCTTTAACTATTAGTTCTGTTTTCATATGAAAAGGTTATATTTTTAATTCAATAAGACGAGGCTATATCATAAACTCGCTTAATGCAAGGTAAACCTCGTTTTAATGCAAGGTAAACCTCGTTTTAATGCAAGGTAAACCTCGTTTTAATGCAAGGTAAACCTCGTTTTATAGTCTCTAAGAATTTTATAAATAAAGGCTTTCAGCATGTCTAAAAGCATTTAAAAGATTTAAAAATATTTAAAAGCTCAGGTCATGAAAAATTGCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGT LN:i:1819 RC:i:106341
|
||||
S 283 AGAAGCCACGCTTTTTAGGCTCTGGTTCAACATGCTCTGGAACAGGAATGCGCTTATTCTCTGGAGTAGTCAATCCGTCATCTTTAGGTGGAACAACTATCACCTGCTCCTTCATTTCTACAATGGGAGCTGATTGAGGGAGTTCGGGTTCACTAGGCTCTTGGATCTCAGGTTTAGGTGCTTCCAGTAGACGTTGCATTGTCTCAATTTGATTTTGATAAAAGGATTCACGCTCTTTTGCATCATCAAGCTGATTTTGCAGCATGTTTATCTGTTGTCTAAGTAGTAAAACTTCTGTCGATATTGGACTGTCTATTTTTACAGATTCTTTTACATCCTGTTTTTTTGAAGGTTCACCAAAAACACGGATAGCCTCGGCTAGATCAATTTTATTATCAGAATTTTTACTTAAAATTCCTTTCTTTATGTTGTTGTAAATCGTTTGTCTATTGATATTGTAAAGCTTAGAAAGCTCAAGAACTGTAAGGCTTTTCATGTTGTAAATCTATGTCTAAACCTGTCAAAAGTTGTTGTCTATTAGACTGCCTAATAGACAAATGATCAAGCATAAATTTTAGGCAGCTTGATAGCCAACTTTTTGAAGATAAGGAAATAATTCTTGGAATTTTTGATAATCTTGGAGCATTTCTGCAATGCGTATAGCAAATTGTTGGAAACTTTCGGTGCCTTCTGAATATTGCCCCATCTCAGGTAATTCAGAAAGTTTGTTGGCAAAGAGATGACGTTGTTTATCACTCATTTTGGTGAAAAGGGCTAATGTATCCGTTCCCTTAATTACTTTGTCAGAGTTGGTTTTTTTCTGCTTAAAACTGAAAGAAAAACCGGAAATTGAACGTCCTTTTTTATGTTGTTCATATTTAGCCTTTACATCGGTTAACTCATTTACTTGCTTGAGGGCAAAATCTAAAACATACGTTTTAAAATTACTCATTGTTTTGTATTGATGAGCTTCGACACCAAGTTGTTGTCTAAAGATCGATAGATCAAAAATTGGCGTTTTACCAGCAGATCTCCATTGAATTAATAACTCGTATAACCGAATAGCATAAGAGCTAGTTAATCTCGAAACCTGCTCCAATTCATATTTAGTAAAGTTTTCTTCCAGTCGAGTTATGAGTGGAACAATGTCTTGCGTGAAACGTAGGAAAACGATCCCGGATTGAGGTTCATATCCAATTTTATCGACCCAACGACAGTGAAAGCTACGATCCTTACCAGTCTTAGGATTAATGTCATGATACGTGACATAGCGATCAAACAAGCTTTTAGATGCATCCCTAAGCACGGTATAAGCAGTGTGTTTCTCGACATTAAAGGTATTGATATAACTACTTGCATGTACTTCTAAAAGGCTATTTTCAGTTATCCCGTGCCCTGTTTCCCGAGCTTCAGCAATGGCTAATAGGATCAGCCTTTGTTCAACTGTATCTAGGGTATAGCTAGCTTGAATTAAAGCATTATCCTTTACGATTAATTCACTCATTTCTAATAATAAATTTATAATTAAACTAGGTTGTACAATAGACTATTTTAATTGAATTGTAAATCGACTTAATTTGGGGTCGATAAACCTAGTTTTAAGGTCGATAAACCTAGTTTTAAGGGTCGATAAACCTAGTTTTAAGGGTCGATAAACCTAGTTTTAAGGGTCGATAAACCTAGGTTTATTAACTCTAAAACTATTATGAATAAAGGGTTACAGCTTACTTAAAAGCATTTAAAAGATTAAAAATAATTAAAAGCCTAGGTAAAGAAAAATTGCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGT LN:i:1854 RC:i:82138
|
||||
S 289 CACGTTCCACGCTTATTTATTGAGCAATGTTCCACGGTTTTATTTAGCCTAAGTTAACCATATGGAAGAAAAAAACGACTCCAGAAGCCACGCTTTTTAGGCTCTGGTTCAACATGCTCTGGAACAGGAATGCGCTTATTCTCTGGAGTAGTCAATCCGTCAT LN:i:163 RC:i:14624
|
||||
S 297 AAAATTACCAATCCTGATCTGTTTGCTACCAATGATCAGATTCAACAAAAAGCCCAGGATTAACCTGGGCTTCGTATAAGGTGTATTATGTTAATTTTAGAAATTATTAAATGATTCCAAGATTTTCTAGCGACTTATAAGTAATTTCATTACGAATAGAACCAGACATTCCTTCTTTCATTTCTAAGTTGAGCGAAAAGGGGATTTTTTTTCCATTAGCTTGCTCCACCCAACCAGTCAACCAACCTACCTGTGGAGTAACATCCATTCCCCATCCACTTTTTGCATAAATCTTACTACCATTTACTTCTTTAATTAGAAGCATTTTTTTAACTTCTTCTTGAGTTTCTAATTTAAAAGGTAATCGGTTATGTGCAAGGTCATCGGCAAAATTAACTTCTTGTACTGGTGTAATTTTAAGGGGGCCAACTAACCAAAAATTATCGACCTGTGTTCCAATATTTGTATTTCCAAAATTAACCCGCTTTACTTCTTTCTGCATTAGCTCTAGGCCAGTCCGTCTTGCAAGCTCTTGATATACTGGAACTGCTGACAATGCCATTGCCTCACCTAAAGTCATATCTTTCTCCCACATAGGATAAGTTCTTTTTTTACCATCCCATTTGAAAATCTCATTTGTTGTTGCTTTATGATTTTCTAGCCCGATTAAAGCATTTAGCATCTTAAATGTTGATGCAGGGACATATTCTTTATTTGCTCGTGCAAGAGCATTACCATAGGTGCTAAGATTTTTACCCTCTTTAATAATAATTACACCCTGTGTTTGAGCTTCATCAAAATAGCTTTTAATAGCTTTTTCATGTTGCTGAGAAGAAATATGAAAATTATCTTCAGATTTAGTTTTAATAGATGAACATGCACTGAGAGAAACTAGAATAGAAATGCTGAATATAGGAAGTATAAATTTTTTCATTACAAATTCATGTTAGGGGAAATTTTGGGGCTTAGAGCATAATATTTTTATTAAAAAAGTTAAAACTCTTTTAATTTAACATAATGGGCGTTATCCGAAATGGCCCTTTAAGTTTACTTTAAGATCAACAACTTACATTGAGTTAAACCCCTGGAACGATCTCGTTCAACCGATTTAGAAATAAATCGGTTTTTTTATGTTCAATTCTGATACTTCTTGCCAATAAAATTGGTTAAAAAATGATCAAAAATACCAATTCCATAGAGTTTGTAAGCCTTTTTTACCCAAAAGAATCCACAATTTCTGCGGATAACCATTGGGATAAATTTTGATTATTTTTTGTACGTTTTGGCTTACAGCGTTTCACGAAATTGTCTGTATCTATCCCCCGATGAAGCTCTTATGATCGATCTATATGGACAGGATGTTCAATGACTTAAAACAACAATAATAATCAGTCAGGCACCATGACCATCAGGTAGTCAGCCCTACCTATGCGCTCCATACCGAAAAATCCCGACATGACGTCGGGATTTTTTATTGCCATTTGCAAAGTCATTCAATCAGATTTAATATATAGACTGTAATCTATATTGTGATTGTAATATGTGGACAGTCATTACAACGGATCTGTTTAATGAATGGCTTGAACAGCAAGATGAAGCAACACAAGAGAAAGTCTTAGCTGCATTGGTTGTTCTACAACAGCAAGGGCCTAGTTTAGGTCGTCCTTTAGTAGATACCGTCTACGATTCTAAATTTACCAATATGAAGGAACTACGTGTTCAGCATCGTGGTAAACCGTTAAGGGCATTTTTTGCATTCGATCCCCTAAGACAAGCGATTGTGCTGTGTATTGGTGATAAGAGTAATAAAAAGCGTTTTTATACAGAGATGCTAGCAATTGCAGATGAGCAATACGCACTTCATCTCTCAACTTTAGGAGATCAATCTAATGGCTAAGAGCTTACAAGAATTATTGGCTAGTCGCTCTCCAGAGAGTCAAGCTCGTATTCAAAAAATGGCTGATGAATTACTTTTAGAAAGTCAGCTTCACCTT
ATCCGAGAAGAGTTAGAAATTTCCCAGAAAGAACTAGCTGCAACCCTTGGAATTAAACAGCCGTCATTATCAGCAATCGAAAATCGTGGTCATGATCTTAAAATTTCAACGATGAAAAAGTATGTTGAAGCAATGGGTGGAAAACTTCGTATTGATGTAGAGCTTCCAACAGGAAAACACATAGGGTTCAATGTCTAATAATAAACCTTTGAAAAATAGTACTAAATTACTTGTTAACTTAGACAAGTTTATATTTTTAGTCAATGCTGCCGATAGCCTAGAAGAAATTGAAATTATTAGAGATTTATGCTGTGAATATTTTTCACATTGTAAGAGGCCAAGCTATTACATTGATATTTTTGACAATGCTTATTGGATAAAATATTACGAAATAGCTTAATCCTTCTAGATTCACATAGGAGTTTCTATGTTAAATAGGACTAAAAATATACTCTAATTTTAGAGTTTTCTTTTAGGTATGATGTAAAAACATACAAGCCTAAGAGTTTAATTTAAAGGTTAAGAATTGAATAAATTAGTAAATATTTTTATGGCTTTAGGAGTTTGCCTTTCAATGGAAACATTTGCAGAACCTGTTTTTTCTAATGATTTATTAGCGAAAGCAGAAAGTGGAGATACTTCTGCCCAGTTAGAATTAGCTGAGATTTATCTATATGGTCATGGTGTTGATTCAGATGAAAATCAAGCTGAAATTTGGGCTATTAAATCAGCTGAAAATGGAAATGTAGCAGCAATGTTTTGGTTAGCTGATGGATATGTTACCTATGCTAGATTAATAGAGGATGATGACAAAAACGATTCTTTAGAACATTTCCAAAAAGCTTTTAAGTGGTTTCAGAAAGCTTCAGAAAATGGTCATTCTGAATCAATGGTTGAGTTAGCTGACCTATATACTCGCGCAGCTAGCGGAATAGAAGTTAATATTAAGAAAGCTTTAGAACTTCGTGAAAAGGCTGCAAAGTTAGGTAATAAGAAAGCAATGCGAAGTCTTTCCGTTATGTATCGTGATGGTATAGGCATTCCTAAAAATACTGATTTAGCTCAAAGTTGGTGGGATAAGTCTGAAAATTAATTTTAGTAAAAGTTAAGGATTTTTTAAGAGTCTAATAAATGAATAGTGTTCATGAAATTATAGAAAAAATACATAATGAATGGGAGATTGAACCTAAGAAAGCTATTCAGAGAGGTATGGAATGTCCCTTTCCTTTGCAGTGTTCATTAAACCTAAAAAGCAAAATTTACCCACAAATACCCCAAGTACTTTTACCAAAAGTATTAGAGGATTTTTATACAGTATCTAATGGTGCAGATCTATTTAAAGATCAAGAATACGGACAATGGGGCTTAAAATTATATTCTATTGAGGAAGTAATTTTTGCTTCAAAAATTTATAAAAGTAATCGTAATAATGACGCTCTTCAAAGTGACTTAATTATTGGAGAATTCTATGGAGATTCAGACTTATTGTTAGTTCGATGTGATCCTAATAGTGATGATTACGGCTCTATTTTTGTTGTCTTACCTATAGATCAAAGACAAGATTGGTACATTGTTGCTAATACTTTTCAAGAATTCATTAGTAAATTTTATGAAACTCAGGGTGATAAATTCTGGGAACCTTAAAATTATATGGCCGATATAAGCTCTATTTAACTTATGTTCACCTAAAATTTATTTAATTATCACAAAAATAGTATCATTTTTTGAATTTCATATAACGCCCATTATGTTAAATGGCTTAATCAAAATAACCTAAAATTGCTAAAGTAGGTATAAAAAAGATCGTAAATACAGCAAAATATTGAATCATCTTTTTATGTCTGAAATCTCGTTTAGAAATGATCAAAAGCAATGCCGAAAAAAAAATGACTATGAATGTCCATGAAATTGAATACATACTTAAATCCTAATTAATTATGTTGAGTTTTAAATTTCTCTAAAATTAACATAATACACCTTATACGAAATGCTTTTAATTT
CTCTTGAAAATTCAACGCTGTAGTTTCTAGTTTATAGCCCATCTTCCCCAAGGTGGGTTTTTTTTATGATTTTTCACGTTCCACGCTTATTTATTGAGCAATGTTCCACGGTTTTATTTAGCCTAAGTTAACCATATGGAAGAAAAAAACGACTC LN:i:4149 RC:i:248525
|
||||
S 333 AAAATTACCAATCCTGATCTGTTTGCTACCAATGATCAGATTCAACAAAAAGCCCAGGATTAACCTGGGCTTCGTATAAGGCGTATTATGTTAATTTTAATGGAACTTTATAATTAGACGTTGTTGTTGATATATAGCGAATGCCATCATGGTGGGCACTATATTTTATGTAGATTGAATACCCACAATCTAAATTACACCATTTTTCAGAGTACAGACTTTCTCTTACATCATCTGGGCTTCCCAATAGATCATGAATTTCAACCTTAGACATGTCAAAATTAAGTCCTTTAGGTAGAAGCTCTCTATATTTATTAAATCCTTCAACATTTTGATTATAAAAAAATATTGTTTGAAAAAATAAGTTTTTACCTAACATTGCTGTATTACTAAGATGATCTAAGCAACTAGCTTCATCTCGAAAAAGTAAACAAATACCTTTTTCTTCATTAGCTATATATAAAGTATCTTCATCTAAAGGTTCATTATCCTCATCTAATAAAGGAATATAAAGATTAGGTTCTGTAATTAATTGATTATTTAATAAAAAATAAATCAACTCCTTATCATGATTACTTGTACCCAAAAATTCTAAAAGCTTTTCGGTCATAATATTTTTCTATATTTTCGACTGACAAATTTTAGAAGAAATCCCATGTAAAAGTCATCTACATCGACCCATTTAACATAATAGAGCTTATGCGAAACCACTCTTAATTATTTTAATTAAATCAATTGTTTGTGAATTTTACATAACGCAAAAAAACACCAAAAAGCCGACTTAGAAACAAGTCGGCTGCTTTTTAAGCAAGTTTGTCATATTCTGCCAATAAAATCGGTTAAAAAATGCTCAAAATTTCAATTTTTCCTGATTTCTAAAGGCTCTCTTTTCTGAAATTATCCCCATTTCCTGCGGACAATTCTTGGGATAATTTGAGGATATTTCCTTGTACGTTCTGGCGTACAACAATTCAGGAAAATTGCCCCAATTGGTTGTGCGGTAGATTTTGTGTGTAGGCTTCTTCATTTGAAAATTATATCGCTGAAAAAGCCTTTACAGATAGGTTTGTGCAACAAAGCCTTTCATAAATAAAACTAAAATTTAATGTAAAAGCGTTGAGAATATCCTGCACCATTTTCCCATTTATCAGTCTCTATGTCTAAAGATAGGTTCTTTACTGTTGTCTGTTTATTCGTTTTGTTTTTTATAGCATTTTTTAAAACAGATGGGAGTTCTTGATTAGCAACTTCCTTGTATATGCTTTCAGACATTTTTATAAATTCAGTCTCTGCTGATTTCTTATATTTTGAGTCATTAACATTTAAAACAACTTTCAATTCTTTAATACTACTTTCAGTACCTTCTACATAATAAGCAAGATTATTTGGATTTTCTCCATTTTGAGGATCAGGAGAAATATCAATATATGAAGAAATACAACGATAGGTACCTTCTCCCATTGTGTCTTTCCAGCTAGTCATACCTGGTAAATTCAGCATAGAACATGCTTCACTCACCGATATACTAATGTCTTCTGCTTCTACGTTTCCTGTATCTTTACTTTCTGCTTCTACATTTCCTATATCTTTACTTTCTACTTCCACGCTTTTTTCCTCTACTTCTTTATTCTGAATCGAAGAATCACAAGCTGAAAAAAATAAAGGTAATAACATCACAGCAATAAATTTATTTCTCATAACAATCCTATAAGCTACTAATATTAATTAATTTTTAACTGTAATTTTCCAGTTACCACCAACATCTGCTAGTTCGTATCTATAATTGTAATTATTACACACTAAATGATATCCAACACTTCTAACAAAAGGTAAAGTTGATGTTACAGAATCACACTTATAACCATTTAATTTAACCATGACTGCCATTTGCTTAGCCCCTTGTACTCCTGTAGCTTTATCAATTTTGGCACCCGATTCAATTTTTACTGATTTCATCATCTCTTGTAATACTTTTTCTTGTAATGGAGTTGCAA
AAGTATTTACTGAGCATAATGTAGAGATAGCTAATACAATTTTGATAGATAATTTCATGATTATTAAGTAATTAAAAAAATGATATATTATATATAGTTAAAAATTTAGCAATATTTATATTGGTAAGTCCTTTATGGTTCTAGGCGGTTGGCGGGTTTCGGGGGGCTTGTCCCCCTGAACAAGGTCACGATAGGTAACTTTGAAAATAACGCAGTTATTTGAAAAAGTACCTATTGTGTATCCTTGCTTATTTTTGAATAAGCACTAGTTTAATATAATATTTAAATAAATTCCTGTGTAGGTTTGTTTTTGGGTTTTAGTTAGTGAACTATTTTTTATAATTGAACAAGAAAAAGGAAATATAAAAAATATAGCTCAGGTATTTGAATATCACCATTGAAAAACAAGGCTTCACGGTATGGAGCCGCTGTCGCTTTCAAATCGGTATGCTTGGGTTGCGTTACAGTCCAAGTCACAAGAAAGTATTTAGAAGTCTGACTACTTCCCCTGCACACGCATTATCTCTCTACAACACATGCTCGTATCTTGTGCCGATCATCTCGTACATCTATTTACGATTACATGCGCTATTCCCTGCGTCCTTTACCTAGTGTAATACATCAGTGGTAAAGCGGTTTTAGTCTTTAATGCCTGCCGTCGCTTGTCCACCTTCACATTAAAGCCACCAAAGACTAAGCCGATATATAAAAATATCGCAGTACCCATTGATTCGCATAGTCTCTATTTGCCGCCACCTCATTCCCAGCAAAATCATTTTTAATTTTATTTGTTGTCACTGCTTTTTTAGGAGAACTTTCTACTATCTTTTCCCAGTCAAATAGTGCCTCACCACTGTCTATCCATGTAGTAAAATCATCATAAATCATAATATTTTTTATGGCTTCTTCTTTATTTGCAAAAGTAAATGATGATGTAGTTAATAATAATGTCGGGATCAGAAGGTATTTCATAAAAAGATACTCTAAAATTAATTTTAAGTTAGAATCTTATAAAACAATGACTTAATTAAAAGCAATTTTTTTGCTTTAGGGAATAGTGTATGTCTTTAATAAATTGTCCAGAATGCAACCATTCTGTTAGTGATAAAGCCTTAGATTGTCCATCTTGTGGGGCTAGATTAAGGAAACCTAAAAGAGGTTTCTTTGGTAAGATTTTTAAATGGCTCTTTATTCTTTTTAACCTTTTTATGCTAGTTATGACATTTAAATCTTGTGGAAGTGCTTCAGAAATTATTGCTTCTAGCCAAAATGAATATGAACAAGCAGGAGCTACATTAGGCTCTACATTAGGCTTAGGGATGGTGATAATTTTCTGGGCATTAGTAGATGTAATACTTGGATTATTTGTACTATTCACACGACCAAAGCGATAAAAATAGGGAGATATTCTCCTTATTTTTCAAATAGCTAAGCGTAAATTAAGGGCTTTAACAACTTTTATAACAGTGTCAAAACTAGGATTAACATCCCCAGATAGAGCTTTGTAAAGACTTTCTCTTCCTAAGCCAGTTTCTTTTGATAGTTGAGTCATACCTTTAGCTTTAGCAATGTTTCCTAGTGCTTTAGCGATAAAGGCTGCATCACCGTTGGATTCATCAATACATGCTTGTAAGTAAGCTTGCATATCCTCTTCTGTTTTGAGGTGCTCTGCACTATCCCACTTACGAAGTTTAATAGCCATTTATAGTTCCTCCTCTAGGTCTTTTGCTAGCTTCAAGGCGAGTTTTATATCTTTACTTTGGGTGGTTTTATCACCGCCTGCTAATAAAATAATAACTCTTTGTCCTTGTTGGCAATAATAAACTCTATAACCAGGACCGAAGAAAAAACGTAGTTCAGAAACACCTTCGCCCACAGGCTCAGTATCTCCAAAATTGCCATCTTCAACCCGATCGATACGAACTTGTATGCGTCTTTTTGCCTGTTGGTCTTTAAGTTTGTCAAACCAATCATCAAAGACTTCGGTGGTGTATATTGAG
TACATAAGCAAAATGTATCCTATAGGATACATTTGTGCAAGACATAAACTTGGTTTTTTAGGCCTGAGGCACTGATGTTATTCCCAAGTAGAAAAGTCCTACCTAATAACTAAATAGAAATGTCCTATACAGGCATTTCCGGCCAGTGAGCCTATCAAAGTCCTCATGCATGAAGCCAAAGAAGAAATAATTTATTATTGAGTTTCGCTAAAATTAACATAATACACCTTATACGAAATGCGTTATAATTATCTATAGAAATCAATGCTGTAGTTTCTAGTTTATAGCCCATCTTCCCCAAGGTGGGTTTTTTTTATGATTTTACACGTTCCACGCTTATTTATTGAGCAATGTTCCACGGTTTTATTTAGCCTAAGTTAACCATATGGAAGAAAAAAACGACTC LN:i:4399 RC:i:191002
|
||||
S 6 GAAAAATTGCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGTTAATACTT LN:i:89 RC:i:9779
|
||||
L 6 + 277 - 81M
|
||||
L 6 + 280 + 81M
|
||||
L 232 + 277 + 81M
|
||||
L 280 + 232 - 81M
|
||||
L 282 + 6 + 81M
|
||||
L 283 + 6 + 81M
|
||||
L 289 + 282 + 81M
|
||||
L 289 + 283 + 81M
|
||||
L 297 - 232 + 81M
|
||||
L 297 + 289 + 81M
|
||||
L 333 - 232 + 81M
|
||||
L 333 + 289 + 81M
|
@ -28,7 +28,7 @@ from Bio.SeqRecord import SeqRecord
|
||||
# warnings to stdout and verifying via the print-and-compare check. However,
|
||||
# there was some frustrating cross-platform inconsistency I couldn't resolve.
|
||||
|
||||
possible_unknown_seq_formats = {"embl", "genbank", "gb", "imgt", "qual"}
|
||||
possible_unknown_seq_formats = {"embl", "genbank", "gb", "imgt", "qual", "gfa1", "gfa2"}
|
||||
|
||||
# List of formats including alignment only file formats we can read AND write.
|
||||
# The list is initially hard coded to preserve the original order of the unit
|
||||
@ -1844,6 +1844,61 @@ class TestSeqIO(SeqIOTestBaseClass):
|
||||
messages,
|
||||
)
|
||||
|
||||
def test_gfa1(self):
    """Run the shared SeqIO checks on the GFA 1.x example file."""
    ids = [
        "MTh0",
        "MTh4001",
        "MTo3426",
        "MTh4502",
        "MTo8961",
        "MTh9505",
        "MTh13014",
        "MTh13516",
    ]
    sequences = [
        "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCT...ACATTAT",
        "TATAATAAACACCCTCACCACTACAATCTTCCTAGGAACA...CTACTCT",
        "GGGGTAAATGATGGGTTGGGCCAAGGGGTTAATTAGTACG...TATTAAG",
        "ACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGC...TCTGAGC",
        "ATTCTACCACTCCAGCCTAGCCCCCACCCCTCAACTTGGA...TATAAAC",
        "CTTTTACCACTCCAGCCTAGCCCCTACCCCCCAATTAGGA...AGCCCAA",
        "TTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGAAGGCC...AAAGACC",
        "ACATCATCGAAACCGCAAACATATCATACACAAACGCCTG...CACGATG",
    ]
    lengths = [4001, 501, 501, 5003, 502, 3509, 502, 3053]
    molecule_types = dict.fromkeys(
        ("embl", "genbank", "imgt", "seqxml", "nexus"), "DNA"
    )
    # The same message is expected from every quality based format.
    no_quality = "No suitable quality scores found in letter_annotations of SeqRecord (id=MTh13516)."
    too_many = "More than one sequence found"
    messages = {
        "fastq": no_quality,
        "fastq-illumina": no_quality,
        "fastq-solexa": no_quality,
        "nib": too_many,
        "phd": no_quality,
        "qual": no_quality,
        "sff": "Missing SFF flow information",
        "xdna": too_many,
    }
    # The ids double as the names, and there is no alignment (None).
    self.perform_test(
        "gfa1",
        False,
        "GFA/seq.gfa",
        8,
        ids,
        ids,
        sequences,
        lengths,
        None,
        messages,
        molecule_types=molecule_types,
    )
|
||||
|
||||
def test_nexus1(self):
|
||||
sequences = [
|
||||
"A-C-G-Tc-gtgtgtgctct-t-t------ac-gtgtgtgctct-t-t",
|
||||
|
77
Tests/test_SeqIO_Gfa.py
Normal file
77
Tests/test_SeqIO_Gfa.py
Normal file
@ -0,0 +1,77 @@
|
||||
"""Tests for SeqIO GFA module."""
|
||||
import unittest
|
||||
|
||||
from Bio import BiopythonWarning
|
||||
from Bio import SeqIO
|
||||
|
||||
|
||||
class TestRead(unittest.TestCase):
    """Parsing of well-formed GFA files through Bio.SeqIO."""

    def test_read_GFA1(self):
        """Parse several valid GFA 1.x files and check the records."""
        expected_mth13014 = "TTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGAAGGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTATAGTTGTAGCAGGAATCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCACTAATCCAAACTCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGCAGCAGTCTGCGCCCTTACACAAAATGACATCAAAAAAATCGTAGCCTTCTCCACTTCAAGTCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCATACTATTTATGTGCTCCGGGTCCATCATCCACAACCTTAACAATGAACAAGATATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTCACTTCAACCTCCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCTACTCCAAAGACC"
        expected_segment_6 = "GAAAAATTGCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGTTAATACTT"

        # Sequences with tags.
        plain = list(SeqIO.parse("GFA/seq.gfa", "gfa1"))
        self.assertEqual(len(plain), 8)
        seventh = plain[6]
        self.assertEqual(seventh.id, "MTh13014")
        self.assertEqual(seventh.seq, expected_mth13014)
        first_annotations = plain[0].annotations
        self.assertEqual(first_annotations["SN"], ("Z", "MT_human"))
        self.assertEqual(first_annotations["SO"], ("i", "0"))

        # Sequences together with LN tags.
        with_len = list(SeqIO.parse("GFA/seq_with_len.gfa", "gfa1"))
        self.assertEqual(len(with_len), 9)
        self.assertEqual(with_len[8].seq, expected_segment_6)

        # A sequence together with an SH tag.
        with_checksum = list(SeqIO.parse("GFA/fake_with_checksum.gfa", "gfa1"))
        self.assertEqual(len(with_checksum), 1)
        self.assertEqual(with_checksum[0].seq, "AAA")

        # No sequence data, only LN tags.
        without_seq = list(SeqIO.parse("GFA/no_seq.gfa", "gfa1"))
        self.assertEqual(len(without_seq), 9)
        self.assertEqual(len(without_seq[0]), 528)

    def test_read_GFA2(self):
        """Parse a valid GFA 2.0 file and check the single record."""
        parsed = list(SeqIO.parse("GFA/fake_gfa2.gfa", "gfa2"))
        self.assertEqual(len(parsed), 1)
        self.assertEqual(parsed[0].seq, "AAA")
|
||||
|
||||
|
||||
class TestCorrupt(unittest.TestCase):
    """Parsing of malformed GFA files through Bio.SeqIO."""

    @staticmethod
    def _consume(path, fmt):
        """Parse the whole file so that every line is checked (PRIVATE)."""
        return list(SeqIO.parse(path, fmt))

    def test_corrupt_gfa2(self):
        """A GFA 1.x file must be rejected by the GFA 2 parser."""
        self.assertRaises(ValueError, self._consume, "GFA/seq.gfa", "gfa2")

    def test_corrupt_segment_fields(self):
        """A segment line with missing fields must be rejected."""
        self.assertRaises(
            ValueError, self._consume, "GFA/corrupt_segment_fields.gfa", "gfa1"
        )

    def test_corrupt_len(self):
        """A wrong length tag must produce a warning."""
        self.assertWarns(
            BiopythonWarning, self._consume, "GFA/corrupt_len.gfa", "gfa1"
        )

    def test_corrupt_checksum(self):
        """A wrong checksum tag must produce a warning."""
        self.assertWarns(
            BiopythonWarning, self._consume, "GFA/corrupt_checksum.gfa", "gfa1"
        )

    def test_corrupt_tag_name(self):
        """An invalid tag name must produce a warning."""
        self.assertWarns(
            BiopythonWarning, self._consume, "GFA/corrupt_tag_name.gfa", "gfa1"
        )

    def test_corrupt_tag_type(self):
        """A tag value that does not fit its type must produce a warning."""
        self.assertWarns(
            BiopythonWarning, self._consume, "GFA/corrupt_tag_type.gfa", "gfa1"
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this module's tests with verbose output when executed as a script.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))
|
Reference in New Issue
Block a user