Don't allow comments at the beginning of Fasta files (#4780)

* update * rename fasta file * update * warning * add FastaBlastIterator, FastaPearsonIterator * revert fmt * update docs * spelling * more spelling --------- Co-authored-by: Michiel de Hoon <mdehoon@madpc2s-MacBook-Pro.local>
2025-10-20 21:53:47 +08:00 · 2024-09-06 02:44:40 +09:00
parent 69b466f038
commit 378c4c124e
10 changed files with 333 additions and 26 deletions
--- a/Bio/SeqIO/FastaIO.py
+++ b/Bio/SeqIO/FastaIO.py
@ -15,6 +15,8 @@ You are expected to use this module via the Bio.SeqIO functions.

 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
+from Bio import BiopythonDeprecationWarning
+

 from .Interfaces import _clean
 from .Interfaces import _get_seq_string
@ -22,6 +24,8 @@ from .Interfaces import _TextIOSource
 from .Interfaces import SequenceIterator
 from .Interfaces import SequenceWriter

+import warnings
+

 def SimpleFastaParser(handle):
    """Iterate over Fasta records as string tuples.
@ -45,14 +49,44 @@ def SimpleFastaParser(handle):
    ('delta', 'CGCGC')

    """
-    # Skip any text before the first record (e.g. blank lines, comments)
-    for line in handle:
-        if line[0] == ">":
-            title = line[1:].rstrip()
-            break
-    else:
-        # no break encountered - probably an empty file
+    try:
+        line = next(handle)
+    except StopIteration:
        return
+    if not line.startswith(">"):
+        warnings.warn(
+            "Previously, the FASTA parser silently ignored comments at the "
+            "beginning of the FASTA file (before the first sequence).\n"
+            "\n"
+            "Nowadays, the FASTA file format is usually understood not to "
+            "have any such comments, and most software packages do not allow "
+            "them. Therefore, the use of comments at the beginning of a FASTA "
+            "file is now deprecated in Biopython.\n"
+            "\n"
+            "In a future Biopython release, this deprecation warning will be "
+            "replaced by a ValueError. To avoid this, there are three "
+            "options:\n"
+            "\n"
+            "(1) Modify your FASTA file to remove such comments at the "
+            "beginning of the file.\n"
+            "\n"
+            "(2) Use SeqIO.parse with the 'fasta-pearson' format instead of "
+            "'fasta'. This format is consistent with the FASTA format defined "
+            "by William Pearson's FASTA aligner software. Thie format allows "
+            "for comments before the first sequence; lines starting with the "
+            "';' character anywhere in the file are also regarded as comment "
+            "lines and are ignored.\n"
+            "\n"
+            "(3) Use the 'fasta-blast' format. This format regards any lines "
+            "starting with '!', '#', or ';' as comment lines. The "
+            "'fasta-blast' format may be safer than the 'fasta-pearson' "
+            "format, as it explicitly indicates which lines are comments. ",
+            BiopythonDeprecationWarning,
+        )
+        for line in handle:
+            if line.startswith(">"):
+                break
+    title = line[1:].rstrip()

    # Main logic
    # Note, remove trailing whitespace, and any internal spaces
@ -137,7 +171,7 @@ def FastaTwoLineParser(handle):


 class FastaIterator(SequenceIterator):
-    """Parser for Fasta files."""
+    """Parser for plain Fasta files without comments."""

    def __init__(
        self,
@ -150,6 +184,9 @@ class FastaIterator(SequenceIterator):
         - source - input stream opened in text mode, or a path to a file
         - alphabet - optional alphabet, not used. Leave as None.

+        This parser expects a plain Fasta format without comments or header
+        lines.
+
        By default this will act like calling Bio.SeqIO.parse(handle, "fasta")
        with no custom handling of the title lines:

@ -241,6 +278,209 @@ class FastaTwoLineIterator(SequenceIterator):
            )


+class FastaBlastIterator(SequenceIterator):
+    """Parser for Fasta files, allowing for comments as in BLAST."""
+
+    def __init__(
+        self,
+        source: _TextIOSource,
+        alphabet: None = None,
+    ) -> None:
+        """Iterate over Fasta records as SeqRecord objects.
+
+        Arguments:
+         - source - input stream opened in text mode, or a path to a file
+         - alphabet - optional alphabet, not used. Leave as None.
+
+        This parser expects the data to be in FASTA format. As in BLAST, lines
+        starting with '#', '!', or ';' are interpreted as comments and ignored.
+
+        This iterator acts like calling Bio.SeqIO.parse(handle, "fasta-blast")
+        with no custom handling of the title lines:
+
+        >>> with open("Fasta/dups.fasta") as handle:
+        ...     for record in FastaIterator(handle):
+        ...         print(record.id)
+        ...
+        alpha
+        beta
+        gamma
+        alpha
+        delta
+
+        If you want to modify the records before writing, for example to change
+        the ID of each record, you can use a generator function as follows:
+
+        >>> def modify_records(records):
+        ...     for record in records:
+        ...         record.id = record.id.upper()
+        ...         yield record
+        ...
+        >>> with open('Fasta/dups.fasta') as handle:
+        ...     for record in modify_records(FastaIterator(handle)):
+        ...         print(record.id)
+        ...
+        ALPHA
+        BETA
+        GAMMA
+        ALPHA
+        DELTA
+
+        """
+        if alphabet is not None:
+            raise ValueError("The alphabet argument is no longer supported")
+        super().__init__(source, mode="t", fmt="FASTA")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        records = self.iterate(handle)
+        return records
+
+    def iterate(self, handle):
+        """Parse the file and generate SeqRecord objects."""
+        for line in handle:
+            if line[0] not in "#!;":
+                break
+        else:
+            return
+        if not line.startswith(">"):
+            raise ValueError(
+                "Expected FASTA record starting with '>' character.\n"
+                "If this line is a comment, please use '#', '!', or ';' as "
+                "the first character, or use the 'fasta-pearson' format for "
+                "parsing.\n"
+                f"Got: '{line}'"
+            )
+        title = line[1:].rstrip()
+        lines = []
+        for line in handle:
+            # Main logic
+            # Note, remove trailing whitespace, and any internal spaces
+            # (and any embedded \r which are possible in mangled files
+            # when not opened in universal read lines mode)
+            if line[0] in "#!;":
+                pass
+            elif line[0] == ">":
+                try:
+                    first_word = title.split(None, 1)[0]
+                except IndexError:
+                    first_word = ""
+                sequence = "".join(lines).replace(" ", "").replace("\r", "")
+                yield SeqRecord(
+                    Seq(sequence), id=first_word, name=first_word, description=title
+                )
+                lines = []
+                title = line[1:].rstrip()
+            else:
+                lines.append(line.rstrip())
+        try:
+            first_word = title.split(None, 1)[0]
+        except IndexError:
+            first_word = ""
+        sequence = "".join(lines).replace(" ", "").replace("\r", "")
+        yield SeqRecord(
+            Seq(sequence), id=first_word, name=first_word, description=title
+        )
+
+
+class FastaPearsonIterator(SequenceIterator):
+    """Parser for Fasta files, allowing for comments as in the FASTA aligner."""
+
+    def __init__(
+        self,
+        source: _TextIOSource,
+        alphabet: None = None,
+    ) -> None:
+        """Iterate over Fasta records as SeqRecord objects.
+
+        Arguments:
+         - source - input stream opened in text mode, or a path to a file
+         - alphabet - optional alphabet, not used. Leave as None.
+
+        This parser expects a Fasta format allowing for a header (before the
+        first sequence record) and comments (lines starting with ';') as in
+        William Pearson's FASTA aligner software.
+
+        This iterator acts as calling Bio.SeqIO.parse(handle, "fasta-pearson")
+        with no custom handling of the title lines:
+
+        >>> with open("Fasta/dups.fasta") as handle:
+        ...     for record in FastaIterator(handle):
+        ...         print(record.id)
+        ...
+        alpha
+        beta
+        gamma
+        alpha
+        delta
+
+        If you want to modify the records before writing, for example to change
+        the ID of each record, you can use a generator function as follows:
+
+        >>> def modify_records(records):
+        ...     for record in records:
+        ...         record.id = record.id.upper()
+        ...         yield record
+        ...
+        >>> with open('Fasta/dups.fasta') as handle:
+        ...     for record in modify_records(FastaIterator(handle)):
+        ...         print(record.id)
+        ...
+        ALPHA
+        BETA
+        GAMMA
+        ALPHA
+        DELTA
+
+        """
+        if alphabet is not None:
+            raise ValueError("The alphabet argument is no longer supported")
+        super().__init__(source, mode="t", fmt="Fasta")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        records = self.iterate(handle)
+        return records
+
+    def iterate(self, handle):
+        """Parse the file and generate SeqRecord objects."""
+        for line in handle:
+            if line.startswith(">"):
+                break
+        else:
+            return
+        title = line[1:].rstrip()
+        lines = []
+        for line in handle:
+            # Main logic
+            # Note, remove trailing whitespace, and any internal spaces
+            # (and any embedded \r which are possible in mangled files
+            # when not opened in universal read lines mode)
+            if line[0] == ";":
+                pass
+            elif line[0] == ">":
+                try:
+                    first_word = title.split(None, 1)[0]
+                except IndexError:
+                    first_word = ""
+                sequence = "".join(lines).replace(" ", "").replace("\r", "")
+                yield SeqRecord(
+                    Seq(sequence), id=first_word, name=first_word, description=title
+                )
+                lines = []
+                title = line[1:].rstrip()
+            else:
+                lines.append(line.rstrip())
+        try:
+            first_word = title.split(None, 1)[0]
+        except IndexError:
+            first_word = ""
+        sequence = "".join(lines).replace(" ", "").replace("\r", "")
+        yield SeqRecord(
+            Seq(sequence), id=first_word, name=first_word, description=title
+        )
+
+
 class FastaWriter(SequenceWriter):
    """Class to write Fasta format files (OBSOLETE).

--- a/Bio/SeqIO/init.py
+++ b/Bio/SeqIO/init.py
@ -420,6 +420,8 @@ _FormatToIterator = {
    "ace": AceIO.AceIterator,
    "fasta": FastaIO.FastaIterator,
    "fasta-2line": FastaIO.FastaTwoLineIterator,
+    "fasta-blast": FastaIO.FastaBlastIterator,
+    "fasta-pearson": FastaIO.FastaPearsonIterator,
    "ig": IgIO.IgIterator,
    "embl": InsdcIO.EmblIterator,
    "embl-cds": InsdcIO.EmblCdsFeatureIterator,
--- a/DEPRECATED.rst
+++ b/DEPRECATED.rst
@ -60,6 +60,22 @@ was deprecated as of Release 1.70.
 Biopython modules, methods, functions
 =====================================

+Bio.SeqIO.FastaIO
+-----------------
+The ``SimpleFastaParser`` (which is used by Bio.SeqIO.parse if
+``format='fasta'`` or ``format='fasta-2line'``) interprets lines before the
+first line starting with '>' as comments and skips them. To be consistent with
+the most common interpretation of the FASTA file format, the use of such
+comment lines at the beginning of the FASTA file was deprecated in Biopython
+Release 1.85.
+As an alternative, you can use ``format='fasta-pearson'`` to specify the FASTA
+file format as defined by William Pearson's FASTA aligner program, allowing for
+comment lines at the top of the FASTA file (lines anywhere in the file starting
+by ';' are also regarded as comment lines and skipped).
+Another option is to use ``format='fasta-blast'``; this follows the FASTA file
+format accepted by BLAST, treating any lines starting with '#', ';', or '!' as
+comment lines and ignoring them.
+
 Bio.Entrez
 ----------
 The ``egquery`` function wrapping the NCBI EGQuery (Entrez Global Query)
--- a/Tests/Fasta/README
+++ b/Tests/Fasta/README
@ -10,11 +10,11 @@ tools.  These are for tested in Bio.SeqIO and Bio.AlignIO
 (where the format is called "fasta") as well as other older
 parts of Biopython such as the Bio.Fasta module.

-ID     Description
-f001   1 protein sequence
-f002   3 DNA sequences
-f003   2 proteins, with comments
-fa01   fasta alignment
+ID        Description
+f001      1 protein sequence
+f002      3 DNA sequences
+f003.fa   2 proteins, with comments
+fa01      fasta alignment

 The following are example "machine readable" pairwise alignment
 output files from the FASTA tools when using the -m 10 command
--- a/Tests/Fasta/aster_blast.pro
+++ b/Tests/Fasta/aster_blast.pro
@ -0,0 +1,7 @@
+# Some header
+>gi|3298468|dbj|BAA31520.1| SAMIPF
+#Some comment here
+GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVGVTNALVFEIV
+;Another comment here
+MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
+!One more comment
--- a/Tests/Fasta/aster_pearson.pro
+++ b/Tests/Fasta/aster_pearson.pro
@ -0,0 +1,7 @@
+Some header
+>gi|3298468|dbj|BAA31520.1| SAMIPF
+;Some comment here
+GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVGVTNALVFEIV
+;Another comment here
+MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
+;One more comment
--- a/Tests/Fasta/f003
+++ b/Tests/Fasta/f003
@ -1,10 +0,0 @@
-# This file has comments in it
->gi|3318709|pdb|1A91|
-MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGGKFLEGAARQPDLIPLLRTQFFIVMGLVDAIPMIAVGL
-GLYVMFAVA
-# I don't really know if FASTA format is supposed to support this
-# but we might be able to without many problems
->gi|whatever|whatever
-MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGG
-# More comments at the end of the file
-# Man, this file has more comments than sequence
--- a/Tests/Fasta/f003.fa
+++ b/Tests/Fasta/f003.fa
@ -0,0 +1,5 @@
+>gi|3318709|pdb|1A91|
+MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGGKFLEGAARQPDLIPLLRTQFFIVMGLVDAIPMIAVGL
+GLYVMFAVA
+>gi|whatever|whatever
+MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGG
--- a/Tests/test_SeqIO_FastaIO.py
+++ b/Tests/test_SeqIO_FastaIO.py
@ -9,10 +9,11 @@ import unittest
 from io import StringIO

 from Bio import SeqIO
-from Bio.SeqIO.FastaIO import FastaIterator
 from Bio.SeqIO.FastaIO import FastaTwoLineParser
 from Bio.SeqIO.FastaIO import SimpleFastaParser

+from Bio import BiopythonDeprecationWarning
+

 def title_to_ids(title):
    """Convert a FASTA title line into the id, name, and description.
@ -192,6 +193,45 @@ class TestSimpleFastaParsers(unittest.TestCase):
                list(FastaTwoLineParser(handle))


+class TestFastaWithComments(unittest.TestCase):
+    """Test FastaBlastIterator and FastaPearsonIterator."""
+
+    expected = SeqIO.read("Fasta/aster.pro", "fasta")
+
+    def test_fasta_blast(self):
+        """Test FastaBlastIterator."""
+
+        expected = self.expected
+
+        record = SeqIO.read("Fasta/aster_blast.pro", "fasta-blast")
+        self.assertEqual(expected.id, record.id)
+        self.assertEqual(expected.name, record.name)
+        self.assertEqual(expected.description, record.description)
+        self.assertEqual(expected.seq, record.seq)
+
+    def test_fasta_pearson(self):
+        """Test FastaPearsonIterator."""
+
+        expected = self.expected
+
+        record = SeqIO.read("Fasta/aster_pearson.pro", "fasta-pearson")
+        self.assertEqual(expected.id, record.id)
+        self.assertEqual(expected.name, record.name)
+        self.assertEqual(expected.description, record.description)
+        self.assertEqual(expected.seq, record.seq)
+
+    def test_valueerrors(self):
+        """Test if ValueErrors are raised if comments are found unexpectedly."""
+
+        self.assertRaises(
+            ValueError, SeqIO.read, "Fasta/aster_pearson.pro", "fasta-blast"
+        )
+        with self.assertWarns(BiopythonDeprecationWarning):
+            record = SeqIO.read("Fasta/aster_pearson.pro", "fasta")
+        with self.assertWarns(BiopythonDeprecationWarning):
+            record = SeqIO.read("Fasta/aster_blast.pro", "fasta")
+
+
 if __name__ == "__main__":
    runner = unittest.TextTestRunner(verbosity=2)
    unittest.main(testRunner=runner)
--- a/Tests/test_Seq_objs.py
+++ b/Tests/test_Seq_objs.py
@ -965,7 +965,7 @@ class StringMethodTests(unittest.TestCase):

    def test_join_Seq_with_file(self):
        """Checks if Seq join correctly concatenates sequence from a file with the spacer."""
-        filename = "Fasta/f003"
+        filename = "Fasta/f003.fa"
        seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")]
        seqlist_as_strings = [str(_) for _ in seqlist]

@ -1005,7 +1005,7 @@ class StringMethodTests(unittest.TestCase):

    def test_join_MutableSeq_with_file(self):
        """Checks if MutableSeq join correctly concatenates sequence from a file with the spacer."""
-        filename = "Fasta/f003"
+        filename = "Fasta/f003.fa"
        seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")]
        seqlist_as_strings = [str(_) for _ in seqlist]