mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 21:53:47 +08:00
Don't allow comments at the beginning of Fasta files (#4780)
* update * rename fasta file * update * warning * add FastaBlastIterator, FastaPearsonIterator * revert fmt * update docs * spelling * more spelling --------- Co-authored-by: Michiel de Hoon <mdehoon@madpc2s-MacBook-Pro.local>
This commit is contained in:
@ -15,6 +15,8 @@ You are expected to use this module via the Bio.SeqIO functions.
|
||||
|
||||
from Bio.Seq import Seq
|
||||
from Bio.SeqRecord import SeqRecord
|
||||
from Bio import BiopythonDeprecationWarning
|
||||
|
||||
|
||||
from .Interfaces import _clean
|
||||
from .Interfaces import _get_seq_string
|
||||
@ -22,6 +24,8 @@ from .Interfaces import _TextIOSource
|
||||
from .Interfaces import SequenceIterator
|
||||
from .Interfaces import SequenceWriter
|
||||
|
||||
import warnings
|
||||
|
||||
|
||||
def SimpleFastaParser(handle):
|
||||
"""Iterate over Fasta records as string tuples.
|
||||
@ -45,14 +49,44 @@ def SimpleFastaParser(handle):
|
||||
('delta', 'CGCGC')
|
||||
|
||||
"""
|
||||
# Skip any text before the first record (e.g. blank lines, comments)
|
||||
for line in handle:
|
||||
if line[0] == ">":
|
||||
title = line[1:].rstrip()
|
||||
break
|
||||
else:
|
||||
# no break encountered - probably an empty file
|
||||
try:
|
||||
line = next(handle)
|
||||
except StopIteration:
|
||||
return
|
||||
if not line.startswith(">"):
|
||||
warnings.warn(
|
||||
"Previously, the FASTA parser silently ignored comments at the "
|
||||
"beginning of the FASTA file (before the first sequence).\n"
|
||||
"\n"
|
||||
"Nowadays, the FASTA file format is usually understood not to "
|
||||
"have any such comments, and most software packages do not allow "
|
||||
"them. Therefore, the use of comments at the beginning of a FASTA "
|
||||
"file is now deprecated in Biopython.\n"
|
||||
"\n"
|
||||
"In a future Biopython release, this deprecation warning will be "
|
||||
"replaced by a ValueError. To avoid this, there are three "
|
||||
"options:\n"
|
||||
"\n"
|
||||
"(1) Modify your FASTA file to remove such comments at the "
|
||||
"beginning of the file.\n"
|
||||
"\n"
|
||||
"(2) Use SeqIO.parse with the 'fasta-pearson' format instead of "
|
||||
"'fasta'. This format is consistent with the FASTA format defined "
|
||||
"by William Pearson's FASTA aligner software. Thie format allows "
|
||||
"for comments before the first sequence; lines starting with the "
|
||||
"';' character anywhere in the file are also regarded as comment "
|
||||
"lines and are ignored.\n"
|
||||
"\n"
|
||||
"(3) Use the 'fasta-blast' format. This format regards any lines "
|
||||
"starting with '!', '#', or ';' as comment lines. The "
|
||||
"'fasta-blast' format may be safer than the 'fasta-pearson' "
|
||||
"format, as it explicitly indicates which lines are comments. ",
|
||||
BiopythonDeprecationWarning,
|
||||
)
|
||||
for line in handle:
|
||||
if line.startswith(">"):
|
||||
break
|
||||
title = line[1:].rstrip()
|
||||
|
||||
# Main logic
|
||||
# Note, remove trailing whitespace, and any internal spaces
|
||||
@ -137,7 +171,7 @@ def FastaTwoLineParser(handle):
|
||||
|
||||
|
||||
class FastaIterator(SequenceIterator):
|
||||
"""Parser for Fasta files."""
|
||||
"""Parser for plain Fasta files without comments."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -150,6 +184,9 @@ class FastaIterator(SequenceIterator):
|
||||
- source - input stream opened in text mode, or a path to a file
|
||||
- alphabet - optional alphabet, not used. Leave as None.
|
||||
|
||||
This parser expects a plain Fasta format without comments or header
|
||||
lines.
|
||||
|
||||
By default this will act like calling Bio.SeqIO.parse(handle, "fasta")
|
||||
with no custom handling of the title lines:
|
||||
|
||||
@ -241,6 +278,209 @@ class FastaTwoLineIterator(SequenceIterator):
|
||||
)
|
||||
|
||||
|
||||
class FastaBlastIterator(SequenceIterator):
|
||||
"""Parser for Fasta files, allowing for comments as in BLAST."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
source: _TextIOSource,
|
||||
alphabet: None = None,
|
||||
) -> None:
|
||||
"""Iterate over Fasta records as SeqRecord objects.
|
||||
|
||||
Arguments:
|
||||
- source - input stream opened in text mode, or a path to a file
|
||||
- alphabet - optional alphabet, not used. Leave as None.
|
||||
|
||||
This parser expects the data to be in FASTA format. As in BLAST, lines
|
||||
starting with '#', '!', or ';' are interpreted as comments and ignored.
|
||||
|
||||
This iterator acts like calling Bio.SeqIO.parse(handle, "fasta-blast")
|
||||
with no custom handling of the title lines:
|
||||
|
||||
>>> with open("Fasta/dups.fasta") as handle:
|
||||
... for record in FastaIterator(handle):
|
||||
... print(record.id)
|
||||
...
|
||||
alpha
|
||||
beta
|
||||
gamma
|
||||
alpha
|
||||
delta
|
||||
|
||||
If you want to modify the records before writing, for example to change
|
||||
the ID of each record, you can use a generator function as follows:
|
||||
|
||||
>>> def modify_records(records):
|
||||
... for record in records:
|
||||
... record.id = record.id.upper()
|
||||
... yield record
|
||||
...
|
||||
>>> with open('Fasta/dups.fasta') as handle:
|
||||
... for record in modify_records(FastaIterator(handle)):
|
||||
... print(record.id)
|
||||
...
|
||||
ALPHA
|
||||
BETA
|
||||
GAMMA
|
||||
ALPHA
|
||||
DELTA
|
||||
|
||||
"""
|
||||
if alphabet is not None:
|
||||
raise ValueError("The alphabet argument is no longer supported")
|
||||
super().__init__(source, mode="t", fmt="FASTA")
|
||||
|
||||
def parse(self, handle):
|
||||
"""Start parsing the file, and return a SeqRecord generator."""
|
||||
records = self.iterate(handle)
|
||||
return records
|
||||
|
||||
def iterate(self, handle):
|
||||
"""Parse the file and generate SeqRecord objects."""
|
||||
for line in handle:
|
||||
if line[0] not in "#!;":
|
||||
break
|
||||
else:
|
||||
return
|
||||
if not line.startswith(">"):
|
||||
raise ValueError(
|
||||
"Expected FASTA record starting with '>' character.\n"
|
||||
"If this line is a comment, please use '#', '!', or ';' as "
|
||||
"the first character, or use the 'fasta-pearson' format for "
|
||||
"parsing.\n"
|
||||
f"Got: '{line}'"
|
||||
)
|
||||
title = line[1:].rstrip()
|
||||
lines = []
|
||||
for line in handle:
|
||||
# Main logic
|
||||
# Note, remove trailing whitespace, and any internal spaces
|
||||
# (and any embedded \r which are possible in mangled files
|
||||
# when not opened in universal read lines mode)
|
||||
if line[0] in "#!;":
|
||||
pass
|
||||
elif line[0] == ">":
|
||||
try:
|
||||
first_word = title.split(None, 1)[0]
|
||||
except IndexError:
|
||||
first_word = ""
|
||||
sequence = "".join(lines).replace(" ", "").replace("\r", "")
|
||||
yield SeqRecord(
|
||||
Seq(sequence), id=first_word, name=first_word, description=title
|
||||
)
|
||||
lines = []
|
||||
title = line[1:].rstrip()
|
||||
else:
|
||||
lines.append(line.rstrip())
|
||||
try:
|
||||
first_word = title.split(None, 1)[0]
|
||||
except IndexError:
|
||||
first_word = ""
|
||||
sequence = "".join(lines).replace(" ", "").replace("\r", "")
|
||||
yield SeqRecord(
|
||||
Seq(sequence), id=first_word, name=first_word, description=title
|
||||
)
|
||||
|
||||
|
||||
class FastaPearsonIterator(SequenceIterator):
|
||||
"""Parser for Fasta files, allowing for comments as in the FASTA aligner."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
source: _TextIOSource,
|
||||
alphabet: None = None,
|
||||
) -> None:
|
||||
"""Iterate over Fasta records as SeqRecord objects.
|
||||
|
||||
Arguments:
|
||||
- source - input stream opened in text mode, or a path to a file
|
||||
- alphabet - optional alphabet, not used. Leave as None.
|
||||
|
||||
This parser expects a Fasta format allowing for a header (before the
|
||||
first sequence record) and comments (lines starting with ';') as in
|
||||
William Pearson's FASTA aligner software.
|
||||
|
||||
This iterator acts as calling Bio.SeqIO.parse(handle, "fasta-pearson")
|
||||
with no custom handling of the title lines:
|
||||
|
||||
>>> with open("Fasta/dups.fasta") as handle:
|
||||
... for record in FastaIterator(handle):
|
||||
... print(record.id)
|
||||
...
|
||||
alpha
|
||||
beta
|
||||
gamma
|
||||
alpha
|
||||
delta
|
||||
|
||||
If you want to modify the records before writing, for example to change
|
||||
the ID of each record, you can use a generator function as follows:
|
||||
|
||||
>>> def modify_records(records):
|
||||
... for record in records:
|
||||
... record.id = record.id.upper()
|
||||
... yield record
|
||||
...
|
||||
>>> with open('Fasta/dups.fasta') as handle:
|
||||
... for record in modify_records(FastaIterator(handle)):
|
||||
... print(record.id)
|
||||
...
|
||||
ALPHA
|
||||
BETA
|
||||
GAMMA
|
||||
ALPHA
|
||||
DELTA
|
||||
|
||||
"""
|
||||
if alphabet is not None:
|
||||
raise ValueError("The alphabet argument is no longer supported")
|
||||
super().__init__(source, mode="t", fmt="Fasta")
|
||||
|
||||
def parse(self, handle):
|
||||
"""Start parsing the file, and return a SeqRecord generator."""
|
||||
records = self.iterate(handle)
|
||||
return records
|
||||
|
||||
def iterate(self, handle):
|
||||
"""Parse the file and generate SeqRecord objects."""
|
||||
for line in handle:
|
||||
if line.startswith(">"):
|
||||
break
|
||||
else:
|
||||
return
|
||||
title = line[1:].rstrip()
|
||||
lines = []
|
||||
for line in handle:
|
||||
# Main logic
|
||||
# Note, remove trailing whitespace, and any internal spaces
|
||||
# (and any embedded \r which are possible in mangled files
|
||||
# when not opened in universal read lines mode)
|
||||
if line[0] == ";":
|
||||
pass
|
||||
elif line[0] == ">":
|
||||
try:
|
||||
first_word = title.split(None, 1)[0]
|
||||
except IndexError:
|
||||
first_word = ""
|
||||
sequence = "".join(lines).replace(" ", "").replace("\r", "")
|
||||
yield SeqRecord(
|
||||
Seq(sequence), id=first_word, name=first_word, description=title
|
||||
)
|
||||
lines = []
|
||||
title = line[1:].rstrip()
|
||||
else:
|
||||
lines.append(line.rstrip())
|
||||
try:
|
||||
first_word = title.split(None, 1)[0]
|
||||
except IndexError:
|
||||
first_word = ""
|
||||
sequence = "".join(lines).replace(" ", "").replace("\r", "")
|
||||
yield SeqRecord(
|
||||
Seq(sequence), id=first_word, name=first_word, description=title
|
||||
)
|
||||
|
||||
|
||||
class FastaWriter(SequenceWriter):
|
||||
"""Class to write Fasta format files (OBSOLETE).
|
||||
|
||||
|
@ -420,6 +420,8 @@ _FormatToIterator = {
|
||||
"ace": AceIO.AceIterator,
|
||||
"fasta": FastaIO.FastaIterator,
|
||||
"fasta-2line": FastaIO.FastaTwoLineIterator,
|
||||
"fasta-blast": FastaIO.FastaBlastIterator,
|
||||
"fasta-pearson": FastaIO.FastaPearsonIterator,
|
||||
"ig": IgIO.IgIterator,
|
||||
"embl": InsdcIO.EmblIterator,
|
||||
"embl-cds": InsdcIO.EmblCdsFeatureIterator,
|
||||
|
@ -60,6 +60,22 @@ was deprecated as of Release 1.70.
|
||||
Biopython modules, methods, functions
|
||||
=====================================
|
||||
|
||||
Bio.SeqIO.FastaIO
|
||||
-----------------
|
||||
The ``SimpleFastaParser`` (which is used by Bio.SeqIO.parse if
|
||||
``format='fasta'`` or ``format='fasta-2line'``) interprets lines before the
|
||||
first line starting with '>' as comments and skips them. To be consistent with
|
||||
the most common interpretation of the FASTA file format, the use of such
|
||||
comment lines at the beginning of the FASTA file was deprecated in Biopython
|
||||
Release 1.85.
|
||||
As an alternative, you can use ``format='fasta-pearson'`` to specify the FASTA
|
||||
file format as defined by William Pearson's FASTA aligner program, allowing for
|
||||
comment lines at the top of the FASTA file (lines anywhere in the file starting
|
||||
by ';' are also regarded as comment lines and skipped).
|
||||
Another option is to use ``format='fasta-blast'``; this follows the FASTA file
|
||||
format accepted by BLAST, treating any lines starting with '#', ';', or '!' as
|
||||
comment lines and ignoring them.
|
||||
|
||||
Bio.Entrez
|
||||
----------
|
||||
The ``egquery`` function wrapping the NCBI EGQuery (Entrez Global Query)
|
||||
|
@ -10,11 +10,11 @@ tools. These are for tested in Bio.SeqIO and Bio.AlignIO
|
||||
(where the format is called "fasta") as well as other older
|
||||
parts of Biopython such as the Bio.Fasta module.
|
||||
|
||||
ID Description
|
||||
f001 1 protein sequence
|
||||
f002 3 DNA sequences
|
||||
f003 2 proteins, with comments
|
||||
fa01 fasta alignment
|
||||
ID Description
|
||||
f001 1 protein sequence
|
||||
f002 3 DNA sequences
|
||||
f003.fa 2 proteins, with comments
|
||||
fa01 fasta alignment
|
||||
|
||||
The following are example "machine readable" pairwise alignment
|
||||
output files from the FASTA tools when using the -m 10 command
|
||||
|
7
Tests/Fasta/aster_blast.pro
Normal file
7
Tests/Fasta/aster_blast.pro
Normal file
@ -0,0 +1,7 @@
|
||||
# Some header
|
||||
>gi|3298468|dbj|BAA31520.1| SAMIPF
|
||||
#Some comment here
|
||||
GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVGVTNALVFEIV
|
||||
;Another comment here
|
||||
MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
|
||||
!One more comment
|
7
Tests/Fasta/aster_pearson.pro
Normal file
7
Tests/Fasta/aster_pearson.pro
Normal file
@ -0,0 +1,7 @@
|
||||
Some header
|
||||
>gi|3298468|dbj|BAA31520.1| SAMIPF
|
||||
;Some comment here
|
||||
GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVGVTNALVFEIV
|
||||
;Another comment here
|
||||
MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
|
||||
;One more comment
|
@ -1,10 +0,0 @@
|
||||
# This file has comments in it
|
||||
>gi|3318709|pdb|1A91|
|
||||
MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGGKFLEGAARQPDLIPLLRTQFFIVMGLVDAIPMIAVGL
|
||||
GLYVMFAVA
|
||||
# I don't really know if FASTA format is supposed to support this
|
||||
# but we might be able to without many problems
|
||||
>gi|whatever|whatever
|
||||
MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGG
|
||||
# More comments at the end of the file
|
||||
# Man, this file has more comments than sequence
|
5
Tests/Fasta/f003.fa
Normal file
5
Tests/Fasta/f003.fa
Normal file
@ -0,0 +1,5 @@
|
||||
>gi|3318709|pdb|1A91|
|
||||
MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGGKFLEGAARQPDLIPLLRTQFFIVMGLVDAIPMIAVGL
|
||||
GLYVMFAVA
|
||||
>gi|whatever|whatever
|
||||
MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGG
|
@ -9,10 +9,11 @@ import unittest
|
||||
from io import StringIO
|
||||
|
||||
from Bio import SeqIO
|
||||
from Bio.SeqIO.FastaIO import FastaIterator
|
||||
from Bio.SeqIO.FastaIO import FastaTwoLineParser
|
||||
from Bio.SeqIO.FastaIO import SimpleFastaParser
|
||||
|
||||
from Bio import BiopythonDeprecationWarning
|
||||
|
||||
|
||||
def title_to_ids(title):
|
||||
"""Convert a FASTA title line into the id, name, and description.
|
||||
@ -192,6 +193,45 @@ class TestSimpleFastaParsers(unittest.TestCase):
|
||||
list(FastaTwoLineParser(handle))
|
||||
|
||||
|
||||
class TestFastaWithComments(unittest.TestCase):
|
||||
"""Test FastaBlastIterator and FastaPearsonIterator."""
|
||||
|
||||
expected = SeqIO.read("Fasta/aster.pro", "fasta")
|
||||
|
||||
def test_fasta_blast(self):
|
||||
"""Test FastaBlastIterator."""
|
||||
|
||||
expected = self.expected
|
||||
|
||||
record = SeqIO.read("Fasta/aster_blast.pro", "fasta-blast")
|
||||
self.assertEqual(expected.id, record.id)
|
||||
self.assertEqual(expected.name, record.name)
|
||||
self.assertEqual(expected.description, record.description)
|
||||
self.assertEqual(expected.seq, record.seq)
|
||||
|
||||
def test_fasta_pearson(self):
|
||||
"""Test FastaPearsonIterator."""
|
||||
|
||||
expected = self.expected
|
||||
|
||||
record = SeqIO.read("Fasta/aster_pearson.pro", "fasta-pearson")
|
||||
self.assertEqual(expected.id, record.id)
|
||||
self.assertEqual(expected.name, record.name)
|
||||
self.assertEqual(expected.description, record.description)
|
||||
self.assertEqual(expected.seq, record.seq)
|
||||
|
||||
def test_valueerrors(self):
|
||||
"""Test if ValueErrors are raised if comments are found unexpectedly."""
|
||||
|
||||
self.assertRaises(
|
||||
ValueError, SeqIO.read, "Fasta/aster_pearson.pro", "fasta-blast"
|
||||
)
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
record = SeqIO.read("Fasta/aster_pearson.pro", "fasta")
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
record = SeqIO.read("Fasta/aster_blast.pro", "fasta")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
runner = unittest.TextTestRunner(verbosity=2)
|
||||
unittest.main(testRunner=runner)
|
||||
|
@ -965,7 +965,7 @@ class StringMethodTests(unittest.TestCase):
|
||||
|
||||
def test_join_Seq_with_file(self):
|
||||
"""Checks if Seq join correctly concatenates sequence from a file with the spacer."""
|
||||
filename = "Fasta/f003"
|
||||
filename = "Fasta/f003.fa"
|
||||
seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")]
|
||||
seqlist_as_strings = [str(_) for _ in seqlist]
|
||||
|
||||
@ -1005,7 +1005,7 @@ class StringMethodTests(unittest.TestCase):
|
||||
|
||||
def test_join_MutableSeq_with_file(self):
|
||||
"""Checks if MutableSeq join correctly concatenates sequence from a file with the spacer."""
|
||||
filename = "Fasta/f003"
|
||||
filename = "Fasta/f003.fa"
|
||||
seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")]
|
||||
seqlist_as_strings = [str(_) for _ in seqlist]
|
||||
|
||||
|
Reference in New Issue
Block a user