Don't allow comments at the beginning of Fasta files (#4780)

* update

* rename fasta file

* update

* warning

* add FastaBlastIterator, FastaPearsonIterator

* revert fmt

* update docs

* spelling

* more spelling

---------

Co-authored-by: Michiel de Hoon <mdehoon@madpc2s-MacBook-Pro.local>
This commit is contained in:
mdehoon
2024-09-06 02:44:40 +09:00
committed by GitHub
parent 69b466f038
commit 378c4c124e
10 changed files with 333 additions and 26 deletions

View File

@ -15,6 +15,8 @@ You are expected to use this module via the Bio.SeqIO functions.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import BiopythonDeprecationWarning
from .Interfaces import _clean
from .Interfaces import _get_seq_string
@ -22,6 +24,8 @@ from .Interfaces import _TextIOSource
from .Interfaces import SequenceIterator
from .Interfaces import SequenceWriter
import warnings
def SimpleFastaParser(handle):
"""Iterate over Fasta records as string tuples.
@ -45,14 +49,44 @@ def SimpleFastaParser(handle):
('delta', 'CGCGC')
"""
# Skip any text before the first record (e.g. blank lines, comments)
for line in handle:
if line[0] == ">":
title = line[1:].rstrip()
break
else:
# no break encountered - probably an empty file
try:
line = next(handle)
except StopIteration:
return
if not line.startswith(">"):
warnings.warn(
"Previously, the FASTA parser silently ignored comments at the "
"beginning of the FASTA file (before the first sequence).\n"
"\n"
"Nowadays, the FASTA file format is usually understood not to "
"have any such comments, and most software packages do not allow "
"them. Therefore, the use of comments at the beginning of a FASTA "
"file is now deprecated in Biopython.\n"
"\n"
"In a future Biopython release, this deprecation warning will be "
"replaced by a ValueError. To avoid this, there are three "
"options:\n"
"\n"
"(1) Modify your FASTA file to remove such comments at the "
"beginning of the file.\n"
"\n"
"(2) Use SeqIO.parse with the 'fasta-pearson' format instead of "
"'fasta'. This format is consistent with the FASTA format defined "
"by William Pearson's FASTA aligner software. Thie format allows "
"for comments before the first sequence; lines starting with the "
"';' character anywhere in the file are also regarded as comment "
"lines and are ignored.\n"
"\n"
"(3) Use the 'fasta-blast' format. This format regards any lines "
"starting with '!', '#', or ';' as comment lines. The "
"'fasta-blast' format may be safer than the 'fasta-pearson' "
"format, as it explicitly indicates which lines are comments. ",
BiopythonDeprecationWarning,
)
for line in handle:
if line.startswith(">"):
break
title = line[1:].rstrip()
# Main logic
# Note, remove trailing whitespace, and any internal spaces
@ -137,7 +171,7 @@ def FastaTwoLineParser(handle):
class FastaIterator(SequenceIterator):
"""Parser for Fasta files."""
"""Parser for plain Fasta files without comments."""
def __init__(
self,
@ -150,6 +184,9 @@ class FastaIterator(SequenceIterator):
- source - input stream opened in text mode, or a path to a file
- alphabet - optional alphabet, not used. Leave as None.
This parser expects a plain Fasta format without comments or header
lines.
By default this will act like calling Bio.SeqIO.parse(handle, "fasta")
with no custom handling of the title lines:
@ -241,6 +278,209 @@ class FastaTwoLineIterator(SequenceIterator):
)
class FastaBlastIterator(SequenceIterator):
"""Parser for Fasta files, allowing for comments as in BLAST."""
def __init__(
self,
source: _TextIOSource,
alphabet: None = None,
) -> None:
"""Iterate over Fasta records as SeqRecord objects.
Arguments:
- source - input stream opened in text mode, or a path to a file
- alphabet - optional alphabet, not used. Leave as None.
This parser expects the data to be in FASTA format. As in BLAST, lines
starting with '#', '!', or ';' are interpreted as comments and ignored.
This iterator acts like calling Bio.SeqIO.parse(handle, "fasta-blast")
with no custom handling of the title lines:
>>> with open("Fasta/dups.fasta") as handle:
... for record in FastaIterator(handle):
... print(record.id)
...
alpha
beta
gamma
alpha
delta
If you want to modify the records before writing, for example to change
the ID of each record, you can use a generator function as follows:
>>> def modify_records(records):
... for record in records:
... record.id = record.id.upper()
... yield record
...
>>> with open('Fasta/dups.fasta') as handle:
... for record in modify_records(FastaIterator(handle)):
... print(record.id)
...
ALPHA
BETA
GAMMA
ALPHA
DELTA
"""
if alphabet is not None:
raise ValueError("The alphabet argument is no longer supported")
super().__init__(source, mode="t", fmt="FASTA")
def parse(self, handle):
"""Start parsing the file, and return a SeqRecord generator."""
records = self.iterate(handle)
return records
def iterate(self, handle):
"""Parse the file and generate SeqRecord objects."""
for line in handle:
if line[0] not in "#!;":
break
else:
return
if not line.startswith(">"):
raise ValueError(
"Expected FASTA record starting with '>' character.\n"
"If this line is a comment, please use '#', '!', or ';' as "
"the first character, or use the 'fasta-pearson' format for "
"parsing.\n"
f"Got: '{line}'"
)
title = line[1:].rstrip()
lines = []
for line in handle:
# Main logic
# Note, remove trailing whitespace, and any internal spaces
# (and any embedded \r which are possible in mangled files
# when not opened in universal read lines mode)
if line[0] in "#!;":
pass
elif line[0] == ">":
try:
first_word = title.split(None, 1)[0]
except IndexError:
first_word = ""
sequence = "".join(lines).replace(" ", "").replace("\r", "")
yield SeqRecord(
Seq(sequence), id=first_word, name=first_word, description=title
)
lines = []
title = line[1:].rstrip()
else:
lines.append(line.rstrip())
try:
first_word = title.split(None, 1)[0]
except IndexError:
first_word = ""
sequence = "".join(lines).replace(" ", "").replace("\r", "")
yield SeqRecord(
Seq(sequence), id=first_word, name=first_word, description=title
)
class FastaPearsonIterator(SequenceIterator):
"""Parser for Fasta files, allowing for comments as in the FASTA aligner."""
def __init__(
self,
source: _TextIOSource,
alphabet: None = None,
) -> None:
"""Iterate over Fasta records as SeqRecord objects.
Arguments:
- source - input stream opened in text mode, or a path to a file
- alphabet - optional alphabet, not used. Leave as None.
This parser expects a Fasta format allowing for a header (before the
first sequence record) and comments (lines starting with ';') as in
William Pearson's FASTA aligner software.
This iterator acts as calling Bio.SeqIO.parse(handle, "fasta-pearson")
with no custom handling of the title lines:
>>> with open("Fasta/dups.fasta") as handle:
... for record in FastaIterator(handle):
... print(record.id)
...
alpha
beta
gamma
alpha
delta
If you want to modify the records before writing, for example to change
the ID of each record, you can use a generator function as follows:
>>> def modify_records(records):
... for record in records:
... record.id = record.id.upper()
... yield record
...
>>> with open('Fasta/dups.fasta') as handle:
... for record in modify_records(FastaIterator(handle)):
... print(record.id)
...
ALPHA
BETA
GAMMA
ALPHA
DELTA
"""
if alphabet is not None:
raise ValueError("The alphabet argument is no longer supported")
super().__init__(source, mode="t", fmt="Fasta")
def parse(self, handle):
"""Start parsing the file, and return a SeqRecord generator."""
records = self.iterate(handle)
return records
def iterate(self, handle):
"""Parse the file and generate SeqRecord objects."""
for line in handle:
if line.startswith(">"):
break
else:
return
title = line[1:].rstrip()
lines = []
for line in handle:
# Main logic
# Note, remove trailing whitespace, and any internal spaces
# (and any embedded \r which are possible in mangled files
# when not opened in universal read lines mode)
if line[0] == ";":
pass
elif line[0] == ">":
try:
first_word = title.split(None, 1)[0]
except IndexError:
first_word = ""
sequence = "".join(lines).replace(" ", "").replace("\r", "")
yield SeqRecord(
Seq(sequence), id=first_word, name=first_word, description=title
)
lines = []
title = line[1:].rstrip()
else:
lines.append(line.rstrip())
try:
first_word = title.split(None, 1)[0]
except IndexError:
first_word = ""
sequence = "".join(lines).replace(" ", "").replace("\r", "")
yield SeqRecord(
Seq(sequence), id=first_word, name=first_word, description=title
)
class FastaWriter(SequenceWriter):
"""Class to write Fasta format files (OBSOLETE).

View File

@ -420,6 +420,8 @@ _FormatToIterator = {
"ace": AceIO.AceIterator,
"fasta": FastaIO.FastaIterator,
"fasta-2line": FastaIO.FastaTwoLineIterator,
"fasta-blast": FastaIO.FastaBlastIterator,
"fasta-pearson": FastaIO.FastaPearsonIterator,
"ig": IgIO.IgIterator,
"embl": InsdcIO.EmblIterator,
"embl-cds": InsdcIO.EmblCdsFeatureIterator,

View File

@ -60,6 +60,22 @@ was deprecated as of Release 1.70.
Biopython modules, methods, functions
=====================================
Bio.SeqIO.FastaIO
-----------------
The ``SimpleFastaParser`` (which is used by Bio.SeqIO.parse if
``format='fasta'`` or ``format='fasta-2line'``) interprets lines before the
first line starting with '>' as comments and skips them. To be consistent with
the most common interpretation of the FASTA file format, the use of such
comment lines at the beginning of the FASTA file was deprecated in Biopython
Release 1.85.
As an alternative, you can use ``format='fasta-pearson'`` to specify the FASTA
file format as defined by William Pearson's FASTA aligner program, allowing for
comment lines at the top of the FASTA file (lines anywhere in the file starting
by ';' are also regarded as comment lines and skipped).
Another option is to use ``format='fasta-blast'``; this follows the FASTA file
format accepted by BLAST, treating any lines starting with '#', ';', or '!' as
comment lines and ignoring them.
Bio.Entrez
----------
The ``egquery`` function wrapping the NCBI EGQuery (Entrez Global Query)

View File

@ -10,11 +10,11 @@ tools. These are for tested in Bio.SeqIO and Bio.AlignIO
(where the format is called "fasta") as well as other older
parts of Biopython such as the Bio.Fasta module.
ID Description
f001 1 protein sequence
f002 3 DNA sequences
f003 2 proteins, with comments
fa01 fasta alignment
ID Description
f001 1 protein sequence
f002 3 DNA sequences
f003.fa 2 proteins, with comments
fa01 fasta alignment
The following are example "machine readable" pairwise alignment
output files from the FASTA tools when using the -m 10 command

View File

@ -0,0 +1,7 @@
# Some header
>gi|3298468|dbj|BAA31520.1| SAMIPF
#Some comment here
GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVGVTNALVFEIV
;Another comment here
MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
!One more comment

View File

@ -0,0 +1,7 @@
Some header
>gi|3298468|dbj|BAA31520.1| SAMIPF
;Some comment here
GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVGVTNALVFEIV
;Another comment here
MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
;One more comment

View File

@ -1,10 +0,0 @@
# This file has comments in it
>gi|3318709|pdb|1A91|
MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGGKFLEGAARQPDLIPLLRTQFFIVMGLVDAIPMIAVGL
GLYVMFAVA
# I don't really know if FASTA format is supposed to support this
# but we might be able to without many problems
>gi|whatever|whatever
MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGG
# More comments at the end of the file
# Man, this file has more comments than sequence

5
Tests/Fasta/f003.fa Normal file
View File

@ -0,0 +1,5 @@
>gi|3318709|pdb|1A91|
MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGGKFLEGAARQPDLIPLLRTQFFIVMGLVDAIPMIAVGL
GLYVMFAVA
>gi|whatever|whatever
MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGG

View File

@ -9,10 +9,11 @@ import unittest
from io import StringIO
from Bio import SeqIO
from Bio.SeqIO.FastaIO import FastaIterator
from Bio.SeqIO.FastaIO import FastaTwoLineParser
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio import BiopythonDeprecationWarning
def title_to_ids(title):
"""Convert a FASTA title line into the id, name, and description.
@ -192,6 +193,45 @@ class TestSimpleFastaParsers(unittest.TestCase):
list(FastaTwoLineParser(handle))
class TestFastaWithComments(unittest.TestCase):
"""Test FastaBlastIterator and FastaPearsonIterator."""
expected = SeqIO.read("Fasta/aster.pro", "fasta")
def test_fasta_blast(self):
"""Test FastaBlastIterator."""
expected = self.expected
record = SeqIO.read("Fasta/aster_blast.pro", "fasta-blast")
self.assertEqual(expected.id, record.id)
self.assertEqual(expected.name, record.name)
self.assertEqual(expected.description, record.description)
self.assertEqual(expected.seq, record.seq)
def test_fasta_pearson(self):
"""Test FastaPearsonIterator."""
expected = self.expected
record = SeqIO.read("Fasta/aster_pearson.pro", "fasta-pearson")
self.assertEqual(expected.id, record.id)
self.assertEqual(expected.name, record.name)
self.assertEqual(expected.description, record.description)
self.assertEqual(expected.seq, record.seq)
def test_valueerrors(self):
"""Test if ValueErrors are raised if comments are found unexpectedly."""
self.assertRaises(
ValueError, SeqIO.read, "Fasta/aster_pearson.pro", "fasta-blast"
)
with self.assertWarns(BiopythonDeprecationWarning):
record = SeqIO.read("Fasta/aster_pearson.pro", "fasta")
with self.assertWarns(BiopythonDeprecationWarning):
record = SeqIO.read("Fasta/aster_blast.pro", "fasta")
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=2)
unittest.main(testRunner=runner)

View File

@ -965,7 +965,7 @@ class StringMethodTests(unittest.TestCase):
def test_join_Seq_with_file(self):
"""Checks if Seq join correctly concatenates sequence from a file with the spacer."""
filename = "Fasta/f003"
filename = "Fasta/f003.fa"
seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")]
seqlist_as_strings = [str(_) for _ in seqlist]
@ -1005,7 +1005,7 @@ class StringMethodTests(unittest.TestCase):
def test_join_MutableSeq_with_file(self):
"""Checks if MutableSeq join correctly concatenates sequence from a file with the spacer."""
filename = "Fasta/f003"
filename = "Fasta/f003.fa"
seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")]
seqlist_as_strings = [str(_) for _ in seqlist]