Add SeqIO support for GFA files. (#4598)

* Adds tags from segment lines into the annotations dictionary of
  the SeqRecord, handles undefined sequences and sets their length
  according to LN tag, raises a warning on incorrect tag types, and
  adds more links to the format documentation.

* Check for valid tag name and store tag types in annotations

* Tag value may contain colons

* Update NEWS.rst
This commit is contained in:
Michael M
2024-01-26 08:27:18 -06:00
committed by GitHub
parent 54e88206d9
commit ea1fbfedca
15 changed files with 391 additions and 1 deletions

177
Bio/SeqIO/GfaIO.py Normal file
View File

@ -0,0 +1,177 @@
"""Bio.SeqIO support for the Graphical Fragment Assembly format.
This format is output by many assemblers and includes linkage information for
how the different sequences fit together; however, we only parse the
segment (sequence) information.
Documentation:
- Version 1.x: https://gfa-spec.github.io/GFA-spec/GFA1.html
- Version 2.0: https://gfa-spec.github.io/GFA-spec/GFA2.html
"""
import warnings
import hashlib
import re
from Bio import BiopythonWarning
from Bio.File import as_handle
from Bio.Seq import Seq, _UndefinedSequenceData
from Bio.SeqRecord import SeqRecord
def _check_tags(seq, tags):
    """Check a segment line's tags for inconsistencies (PRIVATE).

    ``seq`` is the sequence parsed from the segment line and ``tags`` is the
    list of its optional ``NAME:TYPE:VALUE`` fields. Two tags are inspected:

    - ``LN`` (sequence length): if the segment has no sequence data, the
      sequence data is replaced in place by undefined data of the declared
      length; otherwise a BiopythonWarning is issued when the declared
      length differs from the actual length.
    - ``SH`` (SHA-256 checksum): a BiopythonWarning is issued when the
      checksum of the sequence differs from the declared one. The check is
      skipped when the segment has no sequence data, as there is nothing
      to hash.

    Raises ValueError if the value of an ``LN`` tag is not an integer.
    """
    # Record this before an LN tag can swap in a placeholder of non-zero
    # length, so the checksum handling does not depend on the tag order.
    has_sequence = len(seq) > 0
    for tag in tags:
        name = tag[:2]
        value = tag[5:]  # text after the "XX:T:" prefix
        if name == "LN":
            # Sequence length
            try:
                declared_length = int(value)
            except ValueError:
                raise ValueError(
                    f"Segment line has invalid length tag: {tag}."
                ) from None
            if len(seq) == 0:
                # No sequence data, set the sequence length
                seq._data = _UndefinedSequenceData(declared_length)
            elif declared_length != len(seq):
                warnings.warn(
                    f"Segment line has incorrect length. Expected {value} but got {len(seq)}.",
                    BiopythonWarning,
                )
        elif name == "SH" and has_sequence:
            # SHA256 checksum; the tag is compared against upper case hex
            checksum = hashlib.sha256(str(seq).encode()).hexdigest()
            if checksum.upper() != value:
                warnings.warn(
                    f"Segment line has incorrect checksum. Expected {value} but got {checksum}.",
                    BiopythonWarning,
                )
# Value pattern for each tag type, paired with the description used in the
# mismatch warning. These RegExs are part of the GFA 1.0 standard.
_TAG_VALUE_CHECKS = {
    "A": (re.compile(r"[!-~]"), "printable character"),
    "i": (re.compile(r"[-+]?[0-9]+"), "signed integer"),
    "f": (re.compile(r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?"), "float"),
    "Z": (re.compile(r"[ !-~]+"), "printable string"),
    "J": (re.compile(r"[ !-~]+"), "JSON excluding new-line and tab characters"),
    "H": (re.compile(r"[0-9A-F]+"), "byte array in hex format"),
    "B": (
        re.compile(r"[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+"),
        "array of integers or floats",
    ),
}
_TAG_NAME_PATTERN = re.compile(r"[A-Za-z][A-Za-z0-9]")


def _tags_to_annotations(tags):
    """Build an annotations dictionary from a list of tags (PRIVATE).

    Each tag is a ``NAME:TYPE:VALUE`` string. The returned dictionary maps
    every tag name to a ``(type, value)`` tuple of strings; the value is kept
    as text and is not converted. If a name occurs more than once, the last
    occurrence is kept.

    A BiopythonWarning is issued (and the tag is still stored) if the name is
    not a letter followed by a letter or digit, if the type is not one of
    ``A``, ``i``, ``f``, ``Z``, ``J``, ``H`` or ``B``, or if the value does
    not match the pattern for its type.

    Raises ValueError if a tag has fewer than three colon-separated parts.
    """
    annotations = {}
    for tag in tags:
        # Split on the first two colons only: tag value may contain ":".
        parts = tag.split(":", 2)
        if len(parts) < 3:
            raise ValueError(f"Segment line has invalid tag: {tag}.")
        name, tag_type, value = parts
        if _TAG_NAME_PATTERN.fullmatch(name) is None:
            warnings.warn(
                f"Tag has invalid name: {name}. Are they tab delimited?",
                BiopythonWarning,
            )
        annotations[name] = (tag_type, value)
        # Look the type up exactly. A substring test against "AifZJHB" would
        # accept the empty string and multi-letter types such as "if".
        check = _TAG_VALUE_CHECKS.get(tag_type)
        if check is None:
            warnings.warn(f"Tag has invalid type: {tag_type}", BiopythonWarning)
            continue
        pattern, description = check
        if pattern.fullmatch(value) is None:
            warnings.warn(
                f"Tag has incorrect type. Expected {description}, got {value}.",
                BiopythonWarning,
            )
    return annotations
def Gfa1Iterator(source):
    """Iterate over the segment lines of a GFA 1.x file as SeqRecord objects.

    Only lines whose first tab-separated field is ``S`` produce a record; all
    other lines are skipped, and a blank line triggers a BiopythonWarning.
    The second field becomes the record id and name, the third field is the
    sequence (``*`` means no sequence is given), and any remaining fields are
    tags that are checked and stored in the record's annotations.

    Raises ValueError if a segment line has fewer than three fields.

    Documentation: https://gfa-spec.github.io/GFA-spec/GFA1.html
    """
    with as_handle(source) as stream:
        for raw in stream:
            if raw == "\n":
                warnings.warn("GFA data has a blank line.", BiopythonWarning)
                continue
            columns = raw.strip("\n").split("\t")
            record_type = columns[0]
            if record_type != "S":
                # Not a segment line (header, link, path, ...)
                continue
            if len(columns) < 3:
                raise ValueError(
                    f"Segment line must have name and sequence fields: {raw}."
                )
            segment_name, bases = columns[1], columns[2]
            extras = columns[3:]
            # "*" marks a segment whose sequence is not stored in the file.
            sequence = Seq(None, length=0) if bases == "*" else Seq(bases)
            _check_tags(sequence, extras)
            yield SeqRecord(
                sequence,
                id=segment_name,
                name=segment_name,
                annotations=_tags_to_annotations(extras),
            )
def Gfa2Iterator(source):
    """Iterate over the segment lines of a GFA 2.0 file as SeqRecord objects.

    Only lines whose first tab-separated field is ``S`` produce a record; all
    other lines are skipped, and a blank line triggers a BiopythonWarning.
    The second field becomes the record id and name, the third field is the
    segment length, the fourth field is the sequence (``*`` means no sequence
    is given), and any remaining fields are tags that are checked and stored
    in the record's annotations. The length field only has to parse as an
    integer; its value is not used any further here.

    Raises ValueError if a segment line has fewer than four fields or if its
    length field is not an integer.

    Documentation for version 2: https://gfa-spec.github.io/GFA-spec/GFA2.html
    """
    with as_handle(source) as stream:
        for raw in stream:
            if raw == "\n":
                warnings.warn("GFA data has a blank line.", BiopythonWarning)
                continue
            columns = raw.strip("\n").split("\t")
            record_type = columns[0]
            if record_type != "S":
                # Not a segment line
                continue
            if len(columns) < 4:
                raise ValueError(
                    f"Segment line must have name, length, and sequence fields: {raw}."
                )
            segment_name, length_text, bases = columns[1], columns[2], columns[3]
            try:
                # Validation only; the parsed number is discarded.
                int(length_text)
            except ValueError:
                raise ValueError(
                    f"Segment line must have an integer length: {raw}."
                ) from None
            extras = columns[4:]
            # "*" marks a segment whose sequence is not stored in the file.
            sequence = Seq(None, length=0) if bases == "*" else Seq(bases)
            _check_tags(sequence, extras)
            yield SeqRecord(
                sequence,
                id=segment_name,
                name=segment_name,
                annotations=_tags_to_annotations(extras),
            )

View File

@ -291,6 +291,10 @@ names are also used in Bio.AlignIO and include the following:
- gck - Gene Construction Kit's format.
- genbank - The GenBank or GenPept flat file format.
- gb - An alias for "genbank", for consistency with NCBI Entrez Utilities
- gfa1 - Graphical Fragment Assembly versions 1.x. Only segment lines
are parsed and all linkage information is ignored.
- gfa2 - Graphical Fragment Assembly version 2.0. Only segment lines are
parsed and all linkage information is ignored.
- ig - The IntelliGenetics file format, apparently the same as the
MASE alignment format.
- imgt - An EMBL like format from IMGT where the feature tables are more
@ -380,6 +384,7 @@ from Bio.SeqIO import AbiIO
from Bio.SeqIO import AceIO
from Bio.SeqIO import FastaIO
from Bio.SeqIO import GckIO
from Bio.SeqIO import GfaIO
from Bio.SeqIO import IgIO # IntelliGenetics or MASE format
from Bio.SeqIO import InsdcIO # EMBL and GenBank
from Bio.SeqIO import NibIO
@ -420,6 +425,8 @@ _FormatToIterator = {
"gck": GckIO.GckIterator,
"genbank": InsdcIO.GenBankIterator,
"genbank-cds": InsdcIO.GenBankCdsFeatureIterator,
"gfa1": GfaIO.Gfa1Iterator,
"gfa2": GfaIO.Gfa2Iterator,
"imgt": InsdcIO.ImgtIterator,
"nib": NibIO.NibIterator,
"cif-seqres": PdbIO.CifSeqresIterator,

View File

@ -63,11 +63,17 @@ The wwPDB organization announced that they plan to deprecate the FTP
server by the end of the year. See the announcement
`here <https://www.wwpdb.org/news/news?year=2023#65562f0ad78e004e766a96c1>`_.
``Bio.SeqIO`` now supports reading sequences from Graphical Fragment
Assembly (GFA) files with the formats ``gfa1`` and ``gfa2`` (for GFA 1.x and
GFA 2.0 files respectively). All data outside of segment lines, such as
linkage information, is ignored.
Many thanks to the Biopython developers and community for making this release
possible, especially the following contributors:
- Anil Tuncel (first contribution)
- Fabio Zanini (first contribution)
- Michael M. (first contribution)
- Michiel de Hoon
- Peter Cock
- Will Tyler (first contribution)

View File

@ -0,0 +1 @@
S fake AAA SH:H:FFFFFFFFFFFFFFF69566510EE712661F9F14B83385006EF92AEC47F523A38358

View File

@ -0,0 +1 @@
S fake AAA LN:i:2

View File

@ -0,0 +1 @@
S fake

View File

@ -0,0 +1 @@
S fake AAA ~~:i:0

View File

@ -0,0 +1 @@
S fake AAA AB:i:C

1
Tests/GFA/fake_gfa2.gfa Normal file
View File

@ -0,0 +1 @@
S fake 3 AAA

View File

@ -0,0 +1 @@
S fake AAA SH:H:CB1AD2119D8FAFB69566510EE712661F9F14B83385006EF92AEC47F523A38358

21
Tests/GFA/no_seq.gfa Normal file
View File

@ -0,0 +1,21 @@
S 232 * LN:i:528 KC:i:51170
S 277 * LN:i:893 KC:i:50561
S 280 * LN:i:895 KC:i:37634
S 282 * LN:i:1819 KC:i:106341
S 283 * LN:i:1854 KC:i:82138
S 289 * LN:i:163 KC:i:14624
S 297 * LN:i:4149 KC:i:248525
S 333 * LN:i:4399 KC:i:191002
S 6 * LN:i:89 KC:i:9779
L 6 + 277 - 81M
L 6 + 280 + 81M
L 232 + 277 + 81M
L 280 + 232 - 81M
L 282 + 6 + 81M
L 283 + 6 + 81M
L 289 + 282 + 81M
L 289 + 283 + 81M
L 297 - 232 + 81M
L 297 + 289 + 81M
L 333 - 232 + 81M
L 333 + 289 + 81M

19
Tests/GFA/seq.gfa Normal file

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,21 @@
S 232 CCTTATACGAAGCCCAGGTTAATCCTGGGCTTTTTGTTGAATCTGATCATTGGTAGCAAACAGATCAGGATTGGTAATTTTGATGTTTTCCTGACAACTCCTGCAAAGCATCAGCCCAGCAAAAAGTTGTACATGTTCCGTTGATTCACAGAAGGCACATGGCTTAGGAAAAGAGATGATATTGGCTGAATTGACTAATTCTGTTGACATAAGAAGTAACCTTGGATTGTACATATTTATTTTTAATAAATTTCAATACTTTATACCTAATTTAGCCACTAAAATTTGTACATTATTGTATAATCCATGTGTACATATTATTTTTTATAATTTTCATTCACTTAGACCCAAAATAGTATTTATTTTTGTACAACACCATGTACAGAACAATAATCATATAAATCAAATTTTTAGCATAAAAAAGTCCATTAATTTTGTACACAATTCTGAAACTTAAAAATCTAAACTTTCATCAATTTTTTTCATAATTTCAATAAATTAACCTTAATTTTAAGATATATTCTGAAA LN:i:528 RC:i:51170
S 277 TGAAACTTAAAAATCTAAACTTTCATCAATTTTTTTCATAATTTCAATAAATTAACCTTAATTTTAAGATATATTCTGAAATTTGGTTTGAAAGCTGTTTTTACATTATATTTCAATACTTTAAATCAAAAAATTGGATATTTTTTGAAAAACTCAATGAAAGTTTATTTTTTATTTAAAAACAACTAGTTATATTAGTTTTTATCCATTTTTTTGAAACAGTTTCTATATGATAAAAAAACCTATAAAAACCATATCTAGCAAAGGTTTGAGGGTTATCATCTTTAGATGCGTGGTGTGTGACAAAAAAATCCCGGCATGTGCCGGATTCTGGATTAGAAAATTGGCTAAAGTGACGTAGGACTGGTGCTTGGTTTTACATGGAAAAAAGTATTTATTTTCTGGTTTATAAAAACGTAAAAAGATCAGTTTTTGTTCATTCATCCAGGTTAAAAATTTCAACCTAAAACTTTAATTATGAAAAGCTTCACAGAAAGCATTCAAATGCGATTTAAGAGCCTTTATCTAAAAAACATAGATCTTATAGCGAAAAACAGAAAACAGCTCAAAAAACGCAAAAGAGAGTGAAGTAAAGAGATGTTTTGACTTTAGATAGCTGCATAGAGCGAGTGTCTACGAGCGAACTATCAAAATTTGCGTCTAGACTCTCTGAAAAACATTTTTTTGCCCTCTTTAGCCTAAGAAAGCTTAATTTTCATGCAGAAATTTGCTCCTGGACCGAGCGTAGCGAGAAAAAAAGCTCATGAGCGAAGCGAATTCCGAGTTGCTTTTGCTTTTTCTTAAAGTCACGCAAGTATTAACCAAAAAATTGCCCCGACGAACTGAGCGAAAGCGAAGTTCAATAGAGTTTGAGCGAAGCGAAAACCAAGGGC LN:i:893 RC:i:50561
S 280 GCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGTTAATACTTACGTGATTTTAAGAAAAAGCAAAAACAACTCGGAATTCGCTTCGCTCATAAAACTTTTTTACTCGCTACGCTCGGTCTAGGAACAAATTTCTGCATAAAACTTACGCTTTCTTAGGCTAAAAAGCCCCAAAAACTGTTTTTCAGAAAGGCTGAGAGGCAAATTTTATAAGTTCGCTCGTAGACACTCGCTCTATGAATCTATCTAGAATCAAAATCCCTCTTTATTTCGCGCTCATTTGCGTTTTTTGAGCTGTTTTCTGTTTTTCACTATAAGACCTATGTTTTTTACTTAAAGGCTCTTAAATCGCATTTGAATGCTTTCTGTGAAAATTTCCATAATTAAAATTTTAGTTTGAAATTTTTAATTTGGATGAACGGACAAAAAATGATTTTTTTACGTTTTTCTAAACCAAGAAATAAATACTTTTTTCTATATAAAACCAAGCACCAGTCCTACGTCACTTTAGCCAATTTTCTAATCAAGGATCCGGCACATGCCGGGATTTTTTTGTCACACACCACGCATCTAAAGATGATAACCCTCAAACCTTTGCTAGTTATGGTTTTTATAGGTTTTTTTATCATATAGAAACTGTTTCAAAAAAATGGATAAAAAATAATATAACTAGTTGTTTTTAAATAAAAAATAAACTCTCATTGAGTTTTTCAAAAAATATCCAATTTTTTGATTTAAAGCATTGAAATATAATGTAAAAACAACTTTCAAACCAACTTTCAGAATATATCTTAAAATTAAGGTTAATTTATTGAAATTATGAAAAAAATTGATGAAAGTTTAGATTTTTAAGTTTCA LN:i:895 RC:i:37634
S 282 AGAAGCCACGCTTTTTAGGCTCTGGTTCAACATGCTCTGGAACAGGAATGCGCTTATTCTCTGGAGTAGTCAATCCGTCATGTTTAGGATCATCAATATTAGGTTCTGGCTGTACTTGAGCTACATTCGCTTGCGACTGTTCTTGATCGGTAAACGTAGTCATATTGGCTTTTGGTGCTTCTAGTAAGCGCTGCATGGCCTCGATCTGATCTTGATAAAACGATTCACGTTCTAGTGATTGATTTTCTCTAAACAGTGCTTGATTTAACTGTTTTTCCAGTATGTCAACTTGACGTTTTAATAGGTCAACTTCTGTCAAGTTTTGACTGTTAATTGATTGACTTTGATTGACATTAGTATCTTTCTTTTGTGGTTCCCCAAAAACTCTTATGGCTTCTGCGAAGTCAATTAATCCATCAGATCCTTTAGATAAATTTCCTTTGTTTATATGGGCATATATGGCTTGTCGTGAATATCCGTAAAGCTTAGCCAACTCTGAAACTGACAGTTTTTTCATTATGTAAACCAGTTTTCAATTTGTGTTAATACTTGACTGTCAACTTTACAATGTCAATTGACATCATTTATTGAAAGCCAACTTTTCGAAGTAATGGAGTAAGTTCTTTTAATTTCTCCGGCTGTTTTAGCATATCTGCAATCCGTACAGCAAATTGTTCATAGCTTTCGGTGCCTTGTGAAAGTTTTCCCATTTCTGGAAGCTCTGACAATTTACTCGCAAATGAATAGCGTTGTGCATCTGTTAAAGTGATCGTTATATCTTCGCTTACACCTATACTTTTAGCTGACTTCGTTACTGATTTTTTTTGTTTAAAGCTAAATGAAAAACCTGTAATTGATCTACCAGTCTTATGCTGTTCAACTTTAACAGTAATGTCGGTATGTTCATTTACTTGCTTTAATGCAATGTCTAAGACATATTTTTTAAAGTCATACATTCGTTTGTATTCAGTATCGAGTACACCTATTTTTTGTCTGAAATCAGACAAAGTTATAAGAGGCGTTTTTCCTGTACTACGCCATGCAATCAATATCTCATATAACCGAACGGCATAAGCACTTGTTAAATTGCTTATTTGTTGTATTTCATACTTTGTAAATTGTTCTTCTAGTCTTGTAATTAAAGGCACGATAGCGGGAGCAAAAATAAGTCTAACAACCGCTTCATTATCAATATAAGCGACTTCACTCACCCATCTTGACTTGTGATTAATAATGTTGCCTTTTTCACTAAGACTTTGATAACTGAATTGTCTTGCAAATAGATCATCACAAGCATCTTTTAATGCCTGATAAGCCGTGTTTCGATGTACACCAAATTGATTGATGTAGCTTTCAGCATGAACTGTTAATGGATCATTAGCATTTATTCCCTTACCCGATTCCCTTGCTTCAACGATAGCTAGAAGAATTAACCGTTGCTCGACTAAATCAAGGTTATAACTGGCATTAATTAATGCATTATCTTTAACTATTAGTTCTGTTTTCATATGAAAAGGTTATATTTTTAATTCAATAAGACGAGGCTATATCATAAACTCGCTTAATGCAAGGTAAACCTCGTTTTAATGCAAGGTAAACCTCGTTTTAATGCAAGGTAAACCTCGTTTTAATGCAAGGTAAACCTCGTTTTATAGTCTCTAAGAATTTTATAAATAAAGGCTTTCAGCATGTCTAAAAGCATTTAAAAGATTTAAAAATATTTAAAAGCTCAGGTCATGAAAAATTGCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGT LN:i:1819 RC:i:106341
S 283 AGAAGCCACGCTTTTTAGGCTCTGGTTCAACATGCTCTGGAACAGGAATGCGCTTATTCTCTGGAGTAGTCAATCCGTCATCTTTAGGTGGAACAACTATCACCTGCTCCTTCATTTCTACAATGGGAGCTGATTGAGGGAGTTCGGGTTCACTAGGCTCTTGGATCTCAGGTTTAGGTGCTTCCAGTAGACGTTGCATTGTCTCAATTTGATTTTGATAAAAGGATTCACGCTCTTTTGCATCATCAAGCTGATTTTGCAGCATGTTTATCTGTTGTCTAAGTAGTAAAACTTCTGTCGATATTGGACTGTCTATTTTTACAGATTCTTTTACATCCTGTTTTTTTGAAGGTTCACCAAAAACACGGATAGCCTCGGCTAGATCAATTTTATTATCAGAATTTTTACTTAAAATTCCTTTCTTTATGTTGTTGTAAATCGTTTGTCTATTGATATTGTAAAGCTTAGAAAGCTCAAGAACTGTAAGGCTTTTCATGTTGTAAATCTATGTCTAAACCTGTCAAAAGTTGTTGTCTATTAGACTGCCTAATAGACAAATGATCAAGCATAAATTTTAGGCAGCTTGATAGCCAACTTTTTGAAGATAAGGAAATAATTCTTGGAATTTTTGATAATCTTGGAGCATTTCTGCAATGCGTATAGCAAATTGTTGGAAACTTTCGGTGCCTTCTGAATATTGCCCCATCTCAGGTAATTCAGAAAGTTTGTTGGCAAAGAGATGACGTTGTTTATCACTCATTTTGGTGAAAAGGGCTAATGTATCCGTTCCCTTAATTACTTTGTCAGAGTTGGTTTTTTTCTGCTTAAAACTGAAAGAAAAACCGGAAATTGAACGTCCTTTTTTATGTTGTTCATATTTAGCCTTTACATCGGTTAACTCATTTACTTGCTTGAGGGCAAAATCTAAAACATACGTTTTAAAATTACTCATTGTTTTGTATTGATGAGCTTCGACACCAAGTTGTTGTCTAAAGATCGATAGATCAAAAATTGGCGTTTTACCAGCAGATCTCCATTGAATTAATAACTCGTATAACCGAATAGCATAAGAGCTAGTTAATCTCGAAACCTGCTCCAATTCATATTTAGTAAAGTTTTCTTCCAGTCGAGTTATGAGTGGAACAATGTCTTGCGTGAAACGTAGGAAAACGATCCCGGATTGAGGTTCATATCCAATTTTATCGACCCAACGACAGTGAAAGCTACGATCCTTACCAGTCTTAGGATTAATGTCATGATACGTGACATAGCGATCAAACAAGCTTTTAGATGCATCCCTAAGCACGGTATAAGCAGTGTGTTTCTCGACATTAAAGGTATTGATATAACTACTTGCATGTACTTCTAAAAGGCTATTTTCAGTTATCCCGTGCCCTGTTTCCCGAGCTTCAGCAATGGCTAATAGGATCAGCCTTTGTTCAACTGTATCTAGGGTATAGCTAGCTTGAATTAAAGCATTATCCTTTACGATTAATTCACTCATTTCTAATAATAAATTTATAATTAAACTAGGTTGTACAATAGACTATTTTAATTGAATTGTAAATCGACTTAATTTGGGGTCGATAAACCTAGTTTTAAGGTCGATAAACCTAGTTTTAAGGGTCGATAAACCTAGTTTTAAGGGTCGATAAACCTAGTTTTAAGGGTCGATAAACCTAGGTTTATTAACTCTAAAACTATTATGAATAAAGGGTTACAGCTTACTTAAAAGCATTTAAAAGATTAAAAATAATTAAAAGCCTAGGTAAAGAAAAATTGCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGT LN:i:1854 RC:i:82138
S 289 CACGTTCCACGCTTATTTATTGAGCAATGTTCCACGGTTTTATTTAGCCTAAGTTAACCATATGGAAGAAAAAAACGACTCCAGAAGCCACGCTTTTTAGGCTCTGGTTCAACATGCTCTGGAACAGGAATGCGCTTATTCTCTGGAGTAGTCAATCCGTCAT LN:i:163 RC:i:14624
S 297 AAAATTACCAATCCTGATCTGTTTGCTACCAATGATCAGATTCAACAAAAAGCCCAGGATTAACCTGGGCTTCGTATAAGGTGTATTATGTTAATTTTAGAAATTATTAAATGATTCCAAGATTTTCTAGCGACTTATAAGTAATTTCATTACGAATAGAACCAGACATTCCTTCTTTCATTTCTAAGTTGAGCGAAAAGGGGATTTTTTTTCCATTAGCTTGCTCCACCCAACCAGTCAACCAACCTACCTGTGGAGTAACATCCATTCCCCATCCACTTTTTGCATAAATCTTACTACCATTTACTTCTTTAATTAGAAGCATTTTTTTAACTTCTTCTTGAGTTTCTAATTTAAAAGGTAATCGGTTATGTGCAAGGTCATCGGCAAAATTAACTTCTTGTACTGGTGTAATTTTAAGGGGGCCAACTAACCAAAAATTATCGACCTGTGTTCCAATATTTGTATTTCCAAAATTAACCCGCTTTACTTCTTTCTGCATTAGCTCTAGGCCAGTCCGTCTTGCAAGCTCTTGATATACTGGAACTGCTGACAATGCCATTGCCTCACCTAAAGTCATATCTTTCTCCCACATAGGATAAGTTCTTTTTTTACCATCCCATTTGAAAATCTCATTTGTTGTTGCTTTATGATTTTCTAGCCCGATTAAAGCATTTAGCATCTTAAATGTTGATGCAGGGACATATTCTTTATTTGCTCGTGCAAGAGCATTACCATAGGTGCTAAGATTTTTACCCTCTTTAATAATAATTACACCCTGTGTTTGAGCTTCATCAAAATAGCTTTTAATAGCTTTTTCATGTTGCTGAGAAGAAATATGAAAATTATCTTCAGATTTAGTTTTAATAGATGAACATGCACTGAGAGAAACTAGAATAGAAATGCTGAATATAGGAAGTATAAATTTTTTCATTACAAATTCATGTTAGGGGAAATTTTGGGGCTTAGAGCATAATATTTTTATTAAAAAAGTTAAAACTCTTTTAATTTAACATAATGGGCGTTATCCGAAATGGCCCTTTAAGTTTACTTTAAGATCAACAACTTACATTGAGTTAAACCCCTGGAACGATCTCGTTCAACCGATTTAGAAATAAATCGGTTTTTTTATGTTCAATTCTGATACTTCTTGCCAATAAAATTGGTTAAAAAATGATCAAAAATACCAATTCCATAGAGTTTGTAAGCCTTTTTTACCCAAAAGAATCCACAATTTCTGCGGATAACCATTGGGATAAATTTTGATTATTTTTTGTACGTTTTGGCTTACAGCGTTTCACGAAATTGTCTGTATCTATCCCCCGATGAAGCTCTTATGATCGATCTATATGGACAGGATGTTCAATGACTTAAAACAACAATAATAATCAGTCAGGCACCATGACCATCAGGTAGTCAGCCCTACCTATGCGCTCCATACCGAAAAATCCCGACATGACGTCGGGATTTTTTATTGCCATTTGCAAAGTCATTCAATCAGATTTAATATATAGACTGTAATCTATATTGTGATTGTAATATGTGGACAGTCATTACAACGGATCTGTTTAATGAATGGCTTGAACAGCAAGATGAAGCAACACAAGAGAAAGTCTTAGCTGCATTGGTTGTTCTACAACAGCAAGGGCCTAGTTTAGGTCGTCCTTTAGTAGATACCGTCTACGATTCTAAATTTACCAATATGAAGGAACTACGTGTTCAGCATCGTGGTAAACCGTTAAGGGCATTTTTTGCATTCGATCCCCTAAGACAAGCGATTGTGCTGTGTATTGGTGATAAGAGTAATAAAAAGCGTTTTTATACAGAGATGCTAGCAATTGCAGATGAGCAATACGCACTTCATCTCTCAACTTTAGGAGATCAATCTAATGGCTAAGAGCTTACAAGAATTATTGGCTAGTCGCTCTCCAGAGAGTCAAGCTCGTATTCAAAAAATGGCTGATGAATTACTTTTAGAAAGTCAGCTTCACCTT
ATCCGAGAAGAGTTAGAAATTTCCCAGAAAGAACTAGCTGCAACCCTTGGAATTAAACAGCCGTCATTATCAGCAATCGAAAATCGTGGTCATGATCTTAAAATTTCAACGATGAAAAAGTATGTTGAAGCAATGGGTGGAAAACTTCGTATTGATGTAGAGCTTCCAACAGGAAAACACATAGGGTTCAATGTCTAATAATAAACCTTTGAAAAATAGTACTAAATTACTTGTTAACTTAGACAAGTTTATATTTTTAGTCAATGCTGCCGATAGCCTAGAAGAAATTGAAATTATTAGAGATTTATGCTGTGAATATTTTTCACATTGTAAGAGGCCAAGCTATTACATTGATATTTTTGACAATGCTTATTGGATAAAATATTACGAAATAGCTTAATCCTTCTAGATTCACATAGGAGTTTCTATGTTAAATAGGACTAAAAATATACTCTAATTTTAGAGTTTTCTTTTAGGTATGATGTAAAAACATACAAGCCTAAGAGTTTAATTTAAAGGTTAAGAATTGAATAAATTAGTAAATATTTTTATGGCTTTAGGAGTTTGCCTTTCAATGGAAACATTTGCAGAACCTGTTTTTTCTAATGATTTATTAGCGAAAGCAGAAAGTGGAGATACTTCTGCCCAGTTAGAATTAGCTGAGATTTATCTATATGGTCATGGTGTTGATTCAGATGAAAATCAAGCTGAAATTTGGGCTATTAAATCAGCTGAAAATGGAAATGTAGCAGCAATGTTTTGGTTAGCTGATGGATATGTTACCTATGCTAGATTAATAGAGGATGATGACAAAAACGATTCTTTAGAACATTTCCAAAAAGCTTTTAAGTGGTTTCAGAAAGCTTCAGAAAATGGTCATTCTGAATCAATGGTTGAGTTAGCTGACCTATATACTCGCGCAGCTAGCGGAATAGAAGTTAATATTAAGAAAGCTTTAGAACTTCGTGAAAAGGCTGCAAAGTTAGGTAATAAGAAAGCAATGCGAAGTCTTTCCGTTATGTATCGTGATGGTATAGGCATTCCTAAAAATACTGATTTAGCTCAAAGTTGGTGGGATAAGTCTGAAAATTAATTTTAGTAAAAGTTAAGGATTTTTTAAGAGTCTAATAAATGAATAGTGTTCATGAAATTATAGAAAAAATACATAATGAATGGGAGATTGAACCTAAGAAAGCTATTCAGAGAGGTATGGAATGTCCCTTTCCTTTGCAGTGTTCATTAAACCTAAAAAGCAAAATTTACCCACAAATACCCCAAGTACTTTTACCAAAAGTATTAGAGGATTTTTATACAGTATCTAATGGTGCAGATCTATTTAAAGATCAAGAATACGGACAATGGGGCTTAAAATTATATTCTATTGAGGAAGTAATTTTTGCTTCAAAAATTTATAAAAGTAATCGTAATAATGACGCTCTTCAAAGTGACTTAATTATTGGAGAATTCTATGGAGATTCAGACTTATTGTTAGTTCGATGTGATCCTAATAGTGATGATTACGGCTCTATTTTTGTTGTCTTACCTATAGATCAAAGACAAGATTGGTACATTGTTGCTAATACTTTTCAAGAATTCATTAGTAAATTTTATGAAACTCAGGGTGATAAATTCTGGGAACCTTAAAATTATATGGCCGATATAAGCTCTATTTAACTTATGTTCACCTAAAATTTATTTAATTATCACAAAAATAGTATCATTTTTTGAATTTCATATAACGCCCATTATGTTAAATGGCTTAATCAAAATAACCTAAAATTGCTAAAGTAGGTATAAAAAAGATCGTAAATACAGCAAAATATTGAATCATCTTTTTATGTCTGAAATCTCGTTTAGAAATGATCAAAAGCAATGCCGAAAAAAAAATGACTATGAATGTCCATGAAATTGAATACATACTTAAATCCTAATTAATTATGTTGAGTTTTAAATTTCTCTAAAATTAACATAATACACCTTATACGAAATGCTTTTAATTT
CTCTTGAAAATTCAACGCTGTAGTTTCTAGTTTATAGCCCATCTTCCCCAAGGTGGGTTTTTTTTATGATTTTTCACGTTCCACGCTTATTTATTGAGCAATGTTCCACGGTTTTATTTAGCCTAAGTTAACCATATGGAAGAAAAAAACGACTC LN:i:4149 RC:i:248525
S 333 AAAATTACCAATCCTGATCTGTTTGCTACCAATGATCAGATTCAACAAAAAGCCCAGGATTAACCTGGGCTTCGTATAAGGCGTATTATGTTAATTTTAATGGAACTTTATAATTAGACGTTGTTGTTGATATATAGCGAATGCCATCATGGTGGGCACTATATTTTATGTAGATTGAATACCCACAATCTAAATTACACCATTTTTCAGAGTACAGACTTTCTCTTACATCATCTGGGCTTCCCAATAGATCATGAATTTCAACCTTAGACATGTCAAAATTAAGTCCTTTAGGTAGAAGCTCTCTATATTTATTAAATCCTTCAACATTTTGATTATAAAAAAATATTGTTTGAAAAAATAAGTTTTTACCTAACATTGCTGTATTACTAAGATGATCTAAGCAACTAGCTTCATCTCGAAAAAGTAAACAAATACCTTTTTCTTCATTAGCTATATATAAAGTATCTTCATCTAAAGGTTCATTATCCTCATCTAATAAAGGAATATAAAGATTAGGTTCTGTAATTAATTGATTATTTAATAAAAAATAAATCAACTCCTTATCATGATTACTTGTACCCAAAAATTCTAAAAGCTTTTCGGTCATAATATTTTTCTATATTTTCGACTGACAAATTTTAGAAGAAATCCCATGTAAAAGTCATCTACATCGACCCATTTAACATAATAGAGCTTATGCGAAACCACTCTTAATTATTTTAATTAAATCAATTGTTTGTGAATTTTACATAACGCAAAAAAACACCAAAAAGCCGACTTAGAAACAAGTCGGCTGCTTTTTAAGCAAGTTTGTCATATTCTGCCAATAAAATCGGTTAAAAAATGCTCAAAATTTCAATTTTTCCTGATTTCTAAAGGCTCTCTTTTCTGAAATTATCCCCATTTCCTGCGGACAATTCTTGGGATAATTTGAGGATATTTCCTTGTACGTTCTGGCGTACAACAATTCAGGAAAATTGCCCCAATTGGTTGTGCGGTAGATTTTGTGTGTAGGCTTCTTCATTTGAAAATTATATCGCTGAAAAAGCCTTTACAGATAGGTTTGTGCAACAAAGCCTTTCATAAATAAAACTAAAATTTAATGTAAAAGCGTTGAGAATATCCTGCACCATTTTCCCATTTATCAGTCTCTATGTCTAAAGATAGGTTCTTTACTGTTGTCTGTTTATTCGTTTTGTTTTTTATAGCATTTTTTAAAACAGATGGGAGTTCTTGATTAGCAACTTCCTTGTATATGCTTTCAGACATTTTTATAAATTCAGTCTCTGCTGATTTCTTATATTTTGAGTCATTAACATTTAAAACAACTTTCAATTCTTTAATACTACTTTCAGTACCTTCTACATAATAAGCAAGATTATTTGGATTTTCTCCATTTTGAGGATCAGGAGAAATATCAATATATGAAGAAATACAACGATAGGTACCTTCTCCCATTGTGTCTTTCCAGCTAGTCATACCTGGTAAATTCAGCATAGAACATGCTTCACTCACCGATATACTAATGTCTTCTGCTTCTACGTTTCCTGTATCTTTACTTTCTGCTTCTACATTTCCTATATCTTTACTTTCTACTTCCACGCTTTTTTCCTCTACTTCTTTATTCTGAATCGAAGAATCACAAGCTGAAAAAAATAAAGGTAATAACATCACAGCAATAAATTTATTTCTCATAACAATCCTATAAGCTACTAATATTAATTAATTTTTAACTGTAATTTTCCAGTTACCACCAACATCTGCTAGTTCGTATCTATAATTGTAATTATTACACACTAAATGATATCCAACACTTCTAACAAAAGGTAAAGTTGATGTTACAGAATCACACTTATAACCATTTAATTTAACCATGACTGCCATTTGCTTAGCCCCTTGTACTCCTGTAGCTTTATCAATTTTGGCACCCGATTCAATTTTTACTGATTTCATCATCTCTTGTAATACTTTTTCTTGTAATGGAGTTGCAA
AAGTATTTACTGAGCATAATGTAGAGATAGCTAATACAATTTTGATAGATAATTTCATGATTATTAAGTAATTAAAAAAATGATATATTATATATAGTTAAAAATTTAGCAATATTTATATTGGTAAGTCCTTTATGGTTCTAGGCGGTTGGCGGGTTTCGGGGGGCTTGTCCCCCTGAACAAGGTCACGATAGGTAACTTTGAAAATAACGCAGTTATTTGAAAAAGTACCTATTGTGTATCCTTGCTTATTTTTGAATAAGCACTAGTTTAATATAATATTTAAATAAATTCCTGTGTAGGTTTGTTTTTGGGTTTTAGTTAGTGAACTATTTTTTATAATTGAACAAGAAAAAGGAAATATAAAAAATATAGCTCAGGTATTTGAATATCACCATTGAAAAACAAGGCTTCACGGTATGGAGCCGCTGTCGCTTTCAAATCGGTATGCTTGGGTTGCGTTACAGTCCAAGTCACAAGAAAGTATTTAGAAGTCTGACTACTTCCCCTGCACACGCATTATCTCTCTACAACACATGCTCGTATCTTGTGCCGATCATCTCGTACATCTATTTACGATTACATGCGCTATTCCCTGCGTCCTTTACCTAGTGTAATACATCAGTGGTAAAGCGGTTTTAGTCTTTAATGCCTGCCGTCGCTTGTCCACCTTCACATTAAAGCCACCAAAGACTAAGCCGATATATAAAAATATCGCAGTACCCATTGATTCGCATAGTCTCTATTTGCCGCCACCTCATTCCCAGCAAAATCATTTTTAATTTTATTTGTTGTCACTGCTTTTTTAGGAGAACTTTCTACTATCTTTTCCCAGTCAAATAGTGCCTCACCACTGTCTATCCATGTAGTAAAATCATCATAAATCATAATATTTTTTATGGCTTCTTCTTTATTTGCAAAAGTAAATGATGATGTAGTTAATAATAATGTCGGGATCAGAAGGTATTTCATAAAAAGATACTCTAAAATTAATTTTAAGTTAGAATCTTATAAAACAATGACTTAATTAAAAGCAATTTTTTTGCTTTAGGGAATAGTGTATGTCTTTAATAAATTGTCCAGAATGCAACCATTCTGTTAGTGATAAAGCCTTAGATTGTCCATCTTGTGGGGCTAGATTAAGGAAACCTAAAAGAGGTTTCTTTGGTAAGATTTTTAAATGGCTCTTTATTCTTTTTAACCTTTTTATGCTAGTTATGACATTTAAATCTTGTGGAAGTGCTTCAGAAATTATTGCTTCTAGCCAAAATGAATATGAACAAGCAGGAGCTACATTAGGCTCTACATTAGGCTTAGGGATGGTGATAATTTTCTGGGCATTAGTAGATGTAATACTTGGATTATTTGTACTATTCACACGACCAAAGCGATAAAAATAGGGAGATATTCTCCTTATTTTTCAAATAGCTAAGCGTAAATTAAGGGCTTTAACAACTTTTATAACAGTGTCAAAACTAGGATTAACATCCCCAGATAGAGCTTTGTAAAGACTTTCTCTTCCTAAGCCAGTTTCTTTTGATAGTTGAGTCATACCTTTAGCTTTAGCAATGTTTCCTAGTGCTTTAGCGATAAAGGCTGCATCACCGTTGGATTCATCAATACATGCTTGTAAGTAAGCTTGCATATCCTCTTCTGTTTTGAGGTGCTCTGCACTATCCCACTTACGAAGTTTAATAGCCATTTATAGTTCCTCCTCTAGGTCTTTTGCTAGCTTCAAGGCGAGTTTTATATCTTTACTTTGGGTGGTTTTATCACCGCCTGCTAATAAAATAATAACTCTTTGTCCTTGTTGGCAATAATAAACTCTATAACCAGGACCGAAGAAAAAACGTAGTTCAGAAACACCTTCGCCCACAGGCTCAGTATCTCCAAAATTGCCATCTTCAACCCGATCGATACGAACTTGTATGCGTCTTTTTGCCTGTTGGTCTTTAAGTTTGTCAAACCAATCATCAAAGACTTCGGTGGTGTATATTGAG
TACATAAGCAAAATGTATCCTATAGGATACATTTGTGCAAGACATAAACTTGGTTTTTTAGGCCTGAGGCACTGATGTTATTCCCAAGTAGAAAAGTCCTACCTAATAACTAAATAGAAATGTCCTATACAGGCATTTCCGGCCAGTGAGCCTATCAAAGTCCTCATGCATGAAGCCAAAGAAGAAATAATTTATTATTGAGTTTCGCTAAAATTAACATAATACACCTTATACGAAATGCGTTATAATTATCTATAGAAATCAATGCTGTAGTTTCTAGTTTATAGCCCATCTTCCCCAAGGTGGGTTTTTTTTATGATTTTACACGTTCCACGCTTATTTATTGAGCAATGTTCCACGGTTTTATTTAGCCTAAGTTAACCATATGGAAGAAAAAAACGACTC LN:i:4399 RC:i:191002
S 6 GAAAAATTGCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGTTAATACTT LN:i:89 RC:i:9779
L 6 + 277 - 81M
L 6 + 280 + 81M
L 232 + 277 + 81M
L 280 + 232 - 81M
L 282 + 6 + 81M
L 283 + 6 + 81M
L 289 + 282 + 81M
L 289 + 283 + 81M
L 297 - 232 + 81M
L 297 + 289 + 81M
L 333 - 232 + 81M
L 333 + 289 + 81M

View File

@ -28,7 +28,7 @@ from Bio.SeqRecord import SeqRecord
# warnings to stdout and verifying via the print-and-compare check. However,
# there was some frustrating cross-platform inconsistency I couldn't resolve.
possible_unknown_seq_formats = {"embl", "genbank", "gb", "imgt", "qual"}
possible_unknown_seq_formats = {"embl", "genbank", "gb", "imgt", "qual", "gfa1", "gfa2"}
# List of formats including alignment only file formats we can read AND write.
# The list is initially hard coded to preserve the original order of the unit
@ -1844,6 +1844,61 @@ class TestSeqIO(SeqIOTestBaseClass):
messages,
)
def test_gfa1(self):
    # Expected (id, length, abbreviated sequence) for each of the records
    # in GFA/seq.gfa, in file order.
    expected = [
        ("MTh0", 4001, "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCT...ACATTAT"),
        ("MTh4001", 501, "TATAATAAACACCCTCACCACTACAATCTTCCTAGGAACA...CTACTCT"),
        ("MTo3426", 501, "GGGGTAAATGATGGGTTGGGCCAAGGGGTTAATTAGTACG...TATTAAG"),
        ("MTh4502", 5003, "ACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGC...TCTGAGC"),
        ("MTo8961", 502, "ATTCTACCACTCCAGCCTAGCCCCCACCCCTCAACTTGGA...TATAAAC"),
        ("MTh9505", 3509, "CTTTTACCACTCCAGCCTAGCCCCTACCCCCCAATTAGGA...AGCCCAA"),
        ("MTh13014", 502, "TTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGAAGGCC...AAAGACC"),
        ("MTh13516", 3053, "ACATCATCGAAACCGCAAACATATCATACACAAACGCCTG...CACGATG"),
    ]
    ids = [record_id for record_id, _, _ in expected]
    names = ids  # the parser uses the segment name for both id and name
    lengths = [length for _, length, _ in expected]
    sequences = [abbreviated for _, _, abbreviated in expected]
    molecule_types = dict.fromkeys(
        ("embl", "genbank", "imgt", "seqxml", "nexus"), "DNA"
    )
    alignment = None
    # NOTE(review): presumably the message expected from each output format
    # that cannot take these records - confirm against perform_test.
    no_quality = "No suitable quality scores found in letter_annotations of SeqRecord (id=MTh13516)."
    too_many = "More than one sequence found"
    messages = {
        "fastq": no_quality,
        "fastq-illumina": no_quality,
        "fastq-solexa": no_quality,
        "nib": too_many,
        "phd": no_quality,
        "qual": no_quality,
        "sff": "Missing SFF flow information",
        "xdna": too_many,
    }
    self.perform_test(
        "gfa1",
        False,
        "GFA/seq.gfa",
        len(expected),
        ids,
        names,
        sequences,
        lengths,
        alignment,
        messages,
        molecule_types=molecule_types,
    )
def test_nexus1(self):
sequences = [
"A-C-G-Tc-gtgtgtgctct-t-t------ac-gtgtgtgctct-t-t",

77
Tests/test_SeqIO_Gfa.py Normal file
View File

@ -0,0 +1,77 @@
"""Tests for SeqIO GFA module."""
import unittest
from Bio import BiopythonWarning
from Bio import SeqIO
class TestRead(unittest.TestCase):
    """Reading well-formed GFA files through Bio.SeqIO."""

    @staticmethod
    def _load(path, fmt):
        """Return every record parsed from path, as a list."""
        return list(SeqIO.parse(path, fmt))

    def test_read_GFA1(self):
        """Test parsing valid GFA 1.x files."""
        # Segments with sequences and assorted tags.
        parsed = self._load("GFA/seq.gfa", "gfa1")
        self.assertEqual(len(parsed), 8)
        first, seventh = parsed[0], parsed[6]
        self.assertEqual(seventh.id, "MTh13014")
        self.assertEqual(
            seventh.seq,
            "TTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGAAGGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTATAGTTGTAGCAGGAATCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCACTAATCCAAACTCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGCAGCAGTCTGCGCCCTTACACAAAATGACATCAAAAAAATCGTAGCCTTCTCCACTTCAAGTCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCATACTATTTATGTGCTCCGGGTCCATCATCCACAACCTTAACAATGAACAAGATATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTCACTTCAACCTCCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCTACTCCAAAGACC",
        )
        # Tags are stored as (type, value) tuples of strings.
        self.assertEqual(first.annotations["SN"], ("Z", "MT_human"))
        self.assertEqual(first.annotations["SO"], ("i", "0"))

        # Segments with sequences and LN tags.
        parsed = self._load("GFA/seq_with_len.gfa", "gfa1")
        self.assertEqual(len(parsed), 9)
        self.assertEqual(
            parsed[8].seq,
            "GAAAAATTGCCCTTGGTTTTCGCTTCGCTCAAACTCTATTGAACTTCGCTTTCGCTCAGTTCGTCGGGGCAATTTTTTGGTTAATACTT",
        )

        # A segment with an SH checksum tag.
        parsed = self._load("GFA/fake_with_checksum.gfa", "gfa1")
        self.assertEqual(len(parsed), 1)
        self.assertEqual(parsed[0].seq, "AAA")

        # Segments without sequence data take their length from the LN tag.
        parsed = self._load("GFA/no_seq.gfa", "gfa1")
        self.assertEqual(len(parsed), 9)
        self.assertEqual(len(parsed[0]), 528)

    def test_read_GFA2(self):
        """Test parsing valid GFA 2.0 files."""
        parsed = self._load("GFA/fake_gfa2.gfa", "gfa2")
        self.assertEqual(len(parsed), 1)
        self.assertEqual(parsed[0].seq, "AAA")
class TestCorrupt(unittest.TestCase):
    """Malformed GFA input must raise ValueError or emit BiopythonWarning."""

    @staticmethod
    def _drain(path, fmt):
        """Parse path to the end, discarding the records."""
        for _ in SeqIO.parse(path, fmt):
            pass

    def test_corrupt_gfa2(self):
        """Check a GFA 1.x file does not parse in GFA 2."""
        self.assertRaises(ValueError, self._drain, "GFA/seq.gfa", "gfa2")

    def test_corrupt_segment_fields(self):
        """Check a GFA file with invalid fields on a segment line."""
        self.assertRaises(
            ValueError, self._drain, "GFA/corrupt_segment_fields.gfa", "gfa1"
        )

    def test_corrupt_len(self):
        """Check a GFA file with an incorrect length."""
        self.assertWarns(
            BiopythonWarning, self._drain, "GFA/corrupt_len.gfa", "gfa1"
        )

    def test_corrupt_checksum(self):
        """Check a GFA file with an incorrect checksum."""
        self.assertWarns(
            BiopythonWarning, self._drain, "GFA/corrupt_checksum.gfa", "gfa1"
        )

    def test_corrupt_tag_name(self):
        """Check a GFA file with an invalid tag name."""
        self.assertWarns(
            BiopythonWarning, self._drain, "GFA/corrupt_tag_name.gfa", "gfa1"
        )

    def test_corrupt_tag_type(self):
        """Check a GFA file with an incorrect tag type."""
        self.assertWarns(
            BiopythonWarning, self._drain, "GFA/corrupt_tag_type.gfa", "gfa1"
        )
if __name__ == "__main__":
    # Run this module's tests directly, with verbose (level 2) output.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))