mirror of
https://github.com/biopython/biopython.git
synced 2025-10-21 06:23:48 +08:00
* update * rename fasta file * update * warning * add FastaBlastIterator, FastaPearsonIterator * revert fmt * update docs * spelling * more spelling --------- Co-authored-by: Michiel de Hoon <mdehoon@madpc2s-MacBook-Pro.local>
238 lines
8.9 KiB
Python
238 lines
8.9 KiB
Python
# Copyright 2009-2013 by Peter Cock. All rights reserved.
|
|
# Parts copyright 1999 by Jeffrey Chang. All rights reserved.
|
|
# This code is part of the Biopython distribution and governed by its
|
|
# license. Please see the LICENSE file that should have been included
|
|
# as part of this package.
|
|
"""Tests for Bio.SeqIO.FastaIO module."""
|
|
|
|
import unittest
|
|
from io import StringIO
|
|
|
|
from Bio import SeqIO
|
|
from Bio.SeqIO.FastaIO import FastaTwoLineParser
|
|
from Bio.SeqIO.FastaIO import SimpleFastaParser
|
|
|
|
from Bio import BiopythonDeprecationWarning
|
|
|
|
|
|
def title_to_ids(title):
|
|
"""Convert a FASTA title line into the id, name, and description.
|
|
|
|
This is just a quick-n-dirty implementation, and is definitely not meant
|
|
to handle every FASTA title line case.
|
|
"""
|
|
# first split the id information from the description
|
|
# the first item is the id info block, the rest is the description
|
|
all_info = title.split(" ")
|
|
id_info = all_info[0]
|
|
rest = all_info[1:]
|
|
descr = " ".join(rest)
|
|
|
|
# now extract the ids from the id block
|
|
# gi|5690369|gb|AF158246.1|AF158246
|
|
id_info_items = id_info.split("|")
|
|
if len(id_info_items) >= 4:
|
|
assert id_info_items[2] in ["gb", "emb", "dbj", "pdb"], title
|
|
id = id_info_items[3] # the id with version info
|
|
name = id_info_items[4] # the id without version info
|
|
else:
|
|
# Fallback:
|
|
id = id_info_items[0]
|
|
name = id_info_items[0]
|
|
|
|
return id, name, descr
|
|
|
|
|
|
def read_title_and_seq(filename):
|
|
"""Crude parser that gets the first record from a FASTA file."""
|
|
with open(filename) as handle:
|
|
title = handle.readline().rstrip()
|
|
assert title.startswith(">")
|
|
seq = ""
|
|
for line in handle:
|
|
if line.startswith(">"):
|
|
break
|
|
seq += line.strip()
|
|
return title[1:], seq
|
|
|
|
|
|
class Wrapping(unittest.TestCase):
|
|
"""Tests for two-line-per-record FASTA variant."""
|
|
|
|
def test_fails(self):
|
|
"""Test case which should fail."""
|
|
self.assertRaises(ValueError, SeqIO.read, "Fasta/aster.pro", "fasta-2line")
|
|
|
|
def test_passes(self):
|
|
"""Test case which should pass."""
|
|
expected = SeqIO.read("Fasta/aster.pro", "fasta")
|
|
|
|
record = SeqIO.read("Fasta/aster_no_wrap.pro", "fasta")
|
|
self.assertEqual(expected.id, record.id)
|
|
self.assertEqual(expected.name, record.name)
|
|
self.assertEqual(expected.description, record.description)
|
|
self.assertEqual(expected.seq, record.seq)
|
|
|
|
record = SeqIO.read("Fasta/aster_no_wrap.pro", "fasta-2line")
|
|
self.assertEqual(expected.id, record.id)
|
|
self.assertEqual(expected.name, record.name)
|
|
self.assertEqual(expected.description, record.description)
|
|
self.assertEqual(expected.seq, record.seq)
|
|
|
|
|
|
class TitleFunctions(unittest.TestCase):
|
|
"""Test using title functions."""
|
|
|
|
def simple_check(self, filename):
|
|
"""Test parsing single record FASTA files."""
|
|
msg = f"Test failure parsing file {filename}"
|
|
title, seq = read_title_and_seq(filename) # crude parser
|
|
record = SeqIO.read(filename, "fasta")
|
|
self.assertEqual(record.id, title.split()[0], msg=msg)
|
|
self.assertEqual(record.name, title.split()[0], msg=msg)
|
|
self.assertEqual(record.description, title, msg=msg)
|
|
self.assertEqual(record.seq, seq, msg=msg)
|
|
# Uncomment this for testing the methods are calling the right files:
|
|
# print("{%s done}" % filename)
|
|
|
|
def test_no_name(self):
|
|
"""Test FASTA record with no identifier."""
|
|
handle = StringIO(">\nACGT")
|
|
record = SeqIO.read(handle, "fasta")
|
|
handle.close()
|
|
self.assertEqual(record.seq, "ACGT")
|
|
self.assertEqual("", record.id)
|
|
self.assertEqual("", record.name)
|
|
self.assertEqual("", record.description)
|
|
|
|
def test_single_nucleic_files(self):
|
|
"""Test Fasta files containing a single nucleotide sequence."""
|
|
paths = (
|
|
"Fasta/lupine.nu",
|
|
"Fasta/elderberry.nu",
|
|
"Fasta/phlox.nu",
|
|
"Fasta/centaurea.nu",
|
|
"Fasta/wisteria.nu",
|
|
"Fasta/sweetpea.nu",
|
|
"Fasta/lavender.nu",
|
|
"Fasta/f001",
|
|
)
|
|
for path in paths:
|
|
self.simple_check(path)
|
|
|
|
def test_single_proteino_files(self):
|
|
"""Test Fasta files containing a single protein sequence."""
|
|
paths = (
|
|
"Fasta/aster.pro",
|
|
"Fasta/rosemary.pro",
|
|
"Fasta/rose.pro",
|
|
"Fasta/loveliesbleeding.pro",
|
|
)
|
|
for path in paths:
|
|
self.simple_check(path)
|
|
|
|
|
|
class TestSimpleFastaParsers(unittest.TestCase):
|
|
"""Test SimpleFastaParser and FastaTwoLineParser directly."""
|
|
|
|
# Regular cases input strings and outputs
|
|
ins_two_line = [">1\nACGT", ">1\nACGT", ">1\nACGT\n>2\nACGT"]
|
|
outs_two_line = [[("1", "ACGT")], [("1", "ACGT")], [("1", "ACGT"), ("2", "ACGT")]]
|
|
|
|
ins_multiline = [">1\nACGT\nACGT", ">1\nACGT\nACGT\n>2\nACGT\nACGT"]
|
|
outs_multiline = [[("1", "ACGTACGT")], [("1", "ACGTACGT"), ("2", "ACGTACGT")]]
|
|
|
|
# Edge case input strings and outputs
|
|
ins_two_line_edges = [">\nACGT", ">1\n\n", ">1>1\n\n>1\n\n", ""]
|
|
outs_two_line_edges = [[("", "ACGT")], [("1", "")], [("1>1", ""), ("1", "")], []]
|
|
|
|
ins_simple_edges = [">1", ">1\n\n\n", ">\n>1\n>2"]
|
|
outs_simple_edges = [[("1", "")], [("1", "")], [("", ""), ("1", ""), ("2", "")]]
|
|
|
|
def test_regular_SimpleFastaParser(self):
|
|
"""Test regular SimpleFastaParser cases."""
|
|
for inp, out in zip(self.ins_two_line, self.outs_two_line):
|
|
handle1 = StringIO(inp)
|
|
handle2 = StringIO(inp + "\n")
|
|
self.assertEqual(list(SimpleFastaParser(handle1)), out)
|
|
self.assertEqual(list(SimpleFastaParser(handle2)), out)
|
|
for inp, out in zip(self.ins_multiline, self.outs_multiline):
|
|
handle1 = StringIO(inp)
|
|
handle2 = StringIO(inp + "\n")
|
|
self.assertEqual(list(SimpleFastaParser(handle1)), out)
|
|
self.assertEqual(list(SimpleFastaParser(handle2)), out)
|
|
|
|
def test_regular_FastaTwoLineParser(self):
|
|
"""Test regular FastaTwoLineParser cases."""
|
|
for inp, out in zip(self.ins_two_line, self.outs_two_line):
|
|
handle1 = StringIO(inp)
|
|
handle2 = StringIO(inp + "\n")
|
|
self.assertEqual(list(FastaTwoLineParser(handle1)), out)
|
|
self.assertEqual(list(FastaTwoLineParser(handle2)), out)
|
|
|
|
def test_edgecases_SimpleFastaParser(self):
|
|
"""Test SimpleFastaParser edge-cases."""
|
|
for inp, out in zip(self.ins_two_line_edges, self.outs_two_line_edges):
|
|
handle = StringIO(inp)
|
|
self.assertEqual(list(SimpleFastaParser(handle)), out)
|
|
for inp, out in zip(self.ins_simple_edges, self.outs_simple_edges):
|
|
handle = StringIO(inp)
|
|
self.assertEqual(list(SimpleFastaParser(handle)), out)
|
|
|
|
def test_edgecases_FastaTwoLineParser(self):
|
|
"""Test FastaTwoLineParser edge-cases."""
|
|
for inp, out in zip(self.ins_two_line_edges, self.outs_two_line_edges):
|
|
handle = StringIO(inp)
|
|
self.assertEqual(list(FastaTwoLineParser(handle)), out)
|
|
|
|
def test_exceptions_FastaTwoLineParser(self):
|
|
"""Test FastaTwoLineParser exceptions."""
|
|
for inp in self.ins_multiline + self.ins_simple_edges:
|
|
handle = StringIO(inp)
|
|
with self.assertRaises(ValueError):
|
|
list(FastaTwoLineParser(handle))
|
|
|
|
|
|
class TestFastaWithComments(unittest.TestCase):
|
|
"""Test FastaBlastIterator and FastaPearsonIterator."""
|
|
|
|
expected = SeqIO.read("Fasta/aster.pro", "fasta")
|
|
|
|
def test_fasta_blast(self):
|
|
"""Test FastaBlastIterator."""
|
|
|
|
expected = self.expected
|
|
|
|
record = SeqIO.read("Fasta/aster_blast.pro", "fasta-blast")
|
|
self.assertEqual(expected.id, record.id)
|
|
self.assertEqual(expected.name, record.name)
|
|
self.assertEqual(expected.description, record.description)
|
|
self.assertEqual(expected.seq, record.seq)
|
|
|
|
def test_fasta_pearson(self):
|
|
"""Test FastaPearsonIterator."""
|
|
|
|
expected = self.expected
|
|
|
|
record = SeqIO.read("Fasta/aster_pearson.pro", "fasta-pearson")
|
|
self.assertEqual(expected.id, record.id)
|
|
self.assertEqual(expected.name, record.name)
|
|
self.assertEqual(expected.description, record.description)
|
|
self.assertEqual(expected.seq, record.seq)
|
|
|
|
def test_valueerrors(self):
|
|
"""Test if ValueErrors are raised if comments are found unexpectedly."""
|
|
|
|
self.assertRaises(
|
|
ValueError, SeqIO.read, "Fasta/aster_pearson.pro", "fasta-blast"
|
|
)
|
|
with self.assertWarns(BiopythonDeprecationWarning):
|
|
record = SeqIO.read("Fasta/aster_pearson.pro", "fasta")
|
|
with self.assertWarns(BiopythonDeprecationWarning):
|
|
record = SeqIO.read("Fasta/aster_blast.pro", "fasta")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
runner = unittest.TextTestRunner(verbosity=2)
|
|
unittest.main(testRunner=runner)
|