Files
biopython/Tests/test_SeqIO_features.py
bryan brancotte 79759c8394 Locale independent date serialization (#4977)
Locale independent date serialization, also accept datetime.date

Add warning when date is invalid/not in the expected format

Closes #4972
2025-03-21 15:39:57 +00:00

1375 lines
59 KiB
Python

# Copyright 2009-2016 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""SeqFeature related tests for SeqRecord objects from Bio.SeqIO.
Initially this takes matched tests of GenBank and FASTA files from the NCBI
and confirms they are consistent using our different parsers.
"""
import datetime
import locale
import os
import unittest
import warnings
from io import StringIO
from test_SeqIO import SeqIOTestBaseClass
from Bio import BiopythonWarning
from Bio import SeqIO
from Bio.Data.CodonTable import TranslationError
from Bio.Seq import MutableSeq
from Bio.Seq import reverse_complement
from Bio.Seq import Seq
from Bio.Seq import UndefinedSequenceError
from Bio.SeqFeature import AfterPosition
from Bio.SeqFeature import BeforePosition
from Bio.SeqFeature import CompoundLocation
from Bio.SeqFeature import ExactPosition
from Bio.SeqFeature import OneOfPosition
from Bio.SeqFeature import SeqFeature
from Bio.SeqFeature import SimpleLocation
from Bio.SeqFeature import UnknownPosition
from Bio.SeqFeature import WithinPosition
from Bio.SeqIO.InsdcIO import _insdc_location_string
from Bio.SeqRecord import SeqRecord
def _get_location_string(feature, record_length):
return _insdc_location_string(feature.location, record_length)
def make_join_feature(f_list, ftype="misc_feature"):
# NOTE - Does NOT reorder the sub-features
if len({f.location.strand for f in f_list}) == 1:
strand = f_list[0].location.strand
else:
strand = None
for f in f_list:
f.type = ftype
if strand == -1:
# All reverse strand.
# Put into biological order for CompoundLocation
c_loc = CompoundLocation([f.location for f in f_list[::-1]])
else:
# All forward, or mixed strand
c_loc = CompoundLocation([f.location for f in f_list])
return SeqFeature(c_loc, ftype)
# Prepare a single GenBank record with one feature with a %s place holder for
# the feature location [leaves the source feature in place]
with open("GenBank/iro.gb") as handle:
gbk_template = handle.read()
gbk_template = gbk_template.replace(
' gene 341..756\n /gene="FTCD"\n',
' misc_feature %s\n /note="Test case"\n',
)
gbk_template = gbk_template.replace(
" exon 341..384\n"
' /gene="FTCD"\n'
" /number=1\n",
"",
)
gbk_template = gbk_template.replace(
" intron 385..617\n"
' /gene="FTCD"\n'
" /number=1\n",
"",
)
gbk_template = gbk_template.replace(
" exon 618..756\n"
' /gene="FTCD"\n'
" /number=2\n",
"",
)
assert len(gbk_template) == 4445
assert gbk_template.count("%") == 1, gbk_template
class SeqIOFeatureTestBaseClass(SeqIOTestBaseClass):
def compare_feature(self, old, new, msg=None):
"""Check two SeqFeatures agree."""
self.assertEqual(old.type, new.type, msg=msg)
self.assertEqual(old.location.start, new.location.start, msg=msg)
if old.location.strand is not None:
self.assertEqual(old.location.strand, new.location.strand, msg=msg)
self.assertEqual(old.location.ref, new.location.ref, msg=msg)
self.assertEqual(old.location.ref_db, new.location.ref_db, msg=msg)
self.assertEqual(
getattr(old.location, "operator", None),
getattr(new.location, "operator", None),
msg=msg,
)
self.assertEqual(old.location.start, new.location.start, msg=msg)
self.assertEqual(str(old.location.start), str(new.location.start), msg=msg)
self.assertEqual(old.location.end, new.location.end, msg=msg)
self.assertEqual(str(old.location.end), str(new.location.end), msg=msg)
# This only checks key shared qualifiers
# Would a white list be easier?
# for key in ["name","gene","translation","codon_table","codon_start","locus_tag"]:
for key in set(old.qualifiers).intersection(new.qualifiers):
if key in ["db_xref", "protein_id", "product", "note"]:
# EMBL and GenBank files are use different references/notes/etc
continue
err_msg = f"qualifier mismatch for {key}"
if msg:
err_msg = f"{msg}; {err_msg}"
self.assertEqual(old.qualifiers[key], new.qualifiers[key], msg=err_msg)
def compare_record(self, old, new, msg=None, expect_minor_diffs=False):
# Note the name matching is a bit fuzzy
if not expect_minor_diffs:
err_msg = "'%s' or '%s' vs '%s' or '%s' records" % (
old.id,
old.name,
new.id,
new.name,
)
if msg:
err_msg = f"{msg}; {err_msg}"
self.assertTrue(
old.id == new.id
or old.name == new.name
or old.id in new.id
or new.id in old.id
or old.id.replace(" ", "_") == new.id.replace(" ", "_"),
msg=err_msg,
)
self.assertEqual(len(old.seq), len(new.seq), msg=msg)
if len(old.seq) < 200:
old_seq = old.seq
else:
old_seq = old.seq[:100]
if len(new.seq) < 200:
new_seq = new.seq
else:
new_seq = new.seq[:100]
old_seq_upper = old.seq.upper()
try:
bytes(old_seq)
except UndefinedSequenceError:
old_seq = None
new_seq_upper = new.seq.upper()
try:
bytes(new_seq)
except UndefinedSequenceError:
new_seq = None
err_msg = f"'{old_seq}' vs '{new_seq}'"
if msg:
err_msg = f"{msg}; {err_msg}"
if old_seq is None and new_seq is None:
self.assertEqual(repr(old_seq_upper), repr(new_seq_upper), msg=err_msg)
else:
self.assertEqual(old_seq_upper, new_seq_upper, msg=err_msg)
if old.features and new.features:
self.assertEqual(len(old.features), len(new.features), msg=msg)
for old_feature, new_feature in zip(old.features, new.features):
# This assumes they are in the same order
self.compare_feature(old_feature, new_feature, msg=msg)
# Just insist on at least one word in common:
if old.description or new.description:
words = set(old.description.split()).intersection(new.description.split())
self.assertGreater(len(words), 0, msg=msg)
# This only checks common annotation
# Would a white list be easier?
for key in set(old.annotations).intersection(new.annotations):
if key in ["data_file_division", "accessions"]:
# TODO - These are not yet supported on output, or
# have other complications (e.g. different number of accessions
# allowed in various file formats)
continue
elif key == "molecule_type":
# EMBL allows e.g. "genomics DNA" where GenBank is limited.
common_words = set(old.annotations[key].split())
common_words = common_words.intersection(new.annotations[key].split())
self.assertGreater(len(common_words), 0, msg=msg)
elif key == "comment":
# Ignore whitespace
self.assertEqual(
old.annotations[key].split(), new.annotations[key].split(), msg=msg
)
elif key == "references":
if expect_minor_diffs:
# TODO - Implement EMBL output of references
continue
self.assertEqual(
len(old.annotations[key]), len(new.annotations[key]), msg=msg
)
for r1, r2 in zip(old.annotations[key], new.annotations[key]):
self.assertEqual(r1.title, r2.title, msg=msg)
self.assertEqual(r1.authors, r2.authors, msg=msg)
self.assertEqual(r1.journal, r2.journal, msg=msg)
if r1.consrtm and r2.consrtm:
# Not held in EMBL files
self.assertEqual(r1.consrtm, r2.consrtm, msg=msg)
if r1.medline_id and r2.medline_id:
# Not held in EMBL files
self.assertEqual(r1.medline_id, r2.medline_id, msg=msg)
self.assertEqual(r1.pubmed_id, r2.pubmed_id, msg=msg)
elif key == "date" and (
isinstance(old.annotations[key], datetime.datetime)
or isinstance(old.annotations[key], datetime.date)
):
oa = old.annotations[key]
months = [
"JAN",
"FEB",
"MAR",
"APR",
"MAY",
"JUN",
"JUL",
"AUG",
"SEP",
"OCT",
"NOV",
"DEC",
]
oa = f"{oa.day:02d}-{months[oa.month - 1]}-{oa.year}"
self.assertEqual(oa, new.annotations[key], msg=msg)
else:
self.assertEqual(
repr(old.annotations[key]), repr(new.annotations[key]), msg=msg
)
class GenBankLocations(SeqIOFeatureTestBaseClass):
"""Tests for parsing GenBank feature locations."""
def check_loc(self, expected_location_obj, input_location_str, round_trip=True):
rec = SeqIO.read(StringIO(gbk_template % input_location_str), "gb")
self.assertEqual(len(rec.features), 2)
self.assertEqual(rec.features[0].type, "source")
self.assertEqual(rec.features[1].type, "misc_feature")
# TODO - Do we have object equality defined?
self.assertEqual(str(expected_location_obj), str(rec.features[1].location))
if round_trip:
self.assertEqual(
input_location_str, _get_location_string(rec.features[1], 99999)
)
def test_rev_comp_styles(self):
# These two are equivalent locations
ncbi_str = "complement(join(84..283,1149..1188))"
embl_str = "join(complement(1149..1188),complement(84..283))"
exon1 = SimpleLocation(1148, 1188, strand=-1)
exon2 = SimpleLocation(83, 283, strand=-1)
expected_loc = exon1 + exon2
self.check_loc(expected_loc, ncbi_str)
self.check_loc(expected_loc, embl_str, round_trip=False)
# TODO self.assertEqual(ncbi_str, _get_location_string(loc, ...))
def test_mixed_strand(self):
# Trans-spliced example from NC_000932
loc_str = "join(complement(69611..69724),139856..140087,140625..140650)"
exon1 = SimpleLocation(69610, 69724, strand=-1)
exon2 = SimpleLocation(139855, 140087, strand=+1)
exon3 = SimpleLocation(140624, 140650, strand=+1)
expected_loc = exon1 + exon2 + exon3
self.check_loc(expected_loc, loc_str)
# Made up example putting reverse-complement exon in middle,
loc_str = "join(69611..69724,complement(139856..140087),140625..140650)"
exon1 = SimpleLocation(69610, 69724, strand=+1)
exon2 = SimpleLocation(139855, 140087, strand=-1)
exon3 = SimpleLocation(140624, 140650, strand=+1)
expected_loc = exon1 + exon2 + exon3
self.check_loc(expected_loc, loc_str)
# Made up example putting reverse-complement exon at end,
loc_str = "join(69611..69724,139856..140087,complement(140625..140650))"
exon1 = SimpleLocation(69610, 69724, strand=+1)
exon2 = SimpleLocation(139855, 140087, strand=+1)
exon3 = SimpleLocation(140624, 140650, strand=-1)
expected_loc = exon1 + exon2 + exon3
self.check_loc(expected_loc, loc_str)
# Another made up example
loc_str = (
"join(complement(69611..69724),139856..140087,complement(140625..140650))"
)
exon1 = SimpleLocation(69610, 69724, strand=-1)
exon2 = SimpleLocation(139855, 140087, strand=+1)
exon3 = SimpleLocation(140624, 140650, strand=-1)
expected_loc = exon1 + exon2 + exon3
self.check_loc(expected_loc, loc_str)
class SeqFeatureExtractionWritingReading(SeqIOFeatureTestBaseClass):
"""Tests for SeqFeature sequence extract method, writing, and reading."""
def check(self, parent_seq, feature, answer_str, location_str):
self.assertEqual(location_str, _get_location_string(feature, len(parent_seq)))
new = feature.extract(parent_seq)
self.assertIsInstance(new, Seq)
self.assertEqual(new, answer_str)
new = feature.extract(str(parent_seq))
self.assertIsInstance(new, str)
self.assertEqual(new, answer_str)
new = feature.extract(MutableSeq(parent_seq))
self.assertIsInstance(new, Seq) # Not MutableSeq!
self.assertEqual(new, answer_str)
new = feature.extract(Seq(None, len(parent_seq)))
self.assertIsInstance(new, Seq)
self.assertEqual(len(new), len(answer_str))
if len(answer_str) > 0:
self.assertRaises(UndefinedSequenceError, str, new)
if _get_location_string(feature, 1326) != location_str:
# This is to avoid issues with the N^1 between feature which only
# makes sense at the end of the sequence
return
# This template is DNA, but that will still be OK for protein features
# as they have no strand information... but see below for strand fun
rec = SeqIO.read(StringIO(gbk_template % location_str), "gb")
self.assertEqual(1326, len(rec))
self.assertEqual(2, len(rec.features))
self.assertEqual(rec.features[0].type, "source")
self.assertEqual(rec.features[1].type, "misc_feature")
new_f = rec.features[1]
self.assertEqual(location_str, _get_location_string(new_f, 1326))
feature.type = "misc_feature" # hack as may not be misc_feature
if not feature.location.strand:
feature.location.strand = new_f.location.strand # hack as above
self.compare_feature(feature, new_f)
# Some feature method tests
parent = "ACGT" * 250
s = feature.extract(parent)
self.assertEqual(len(feature), len(s))
for i in feature:
self.assertIn(i, feature)
self.assertEqual(set(feature), {i for i in range(1000) if i in feature})
if feature.location.strand == +1:
self.assertEqual(s, "".join(parent[i] for i in feature))
if len(feature):
self.assertEqual(feature.location.start, min(feature.location))
self.assertEqual(feature.location.end, max(feature.location) + 1)
self.assertLessEqual(
len(feature), feature.location.end - feature.location.start
)
def test_simple_rna(self):
"""Feature on RNA (simple, default strand)."""
s = Seq("GAUCRYWSMKHBVDN")
f = SeqFeature(SimpleLocation(5, 10))
self.assertIsNone(f.location.strand)
self.check(s, f, "YWSMK", "6..10")
def test_simple_dna(self):
"""Feature on DNA (simple, default strand)."""
s = Seq("GATCRYWSMKHBVDN")
f = SeqFeature(SimpleLocation(5, 10))
self.check(s, f, "YWSMK", "6..10")
def test_single_letter_dna(self):
"""Feature on DNA (single letter, default strand)."""
s = Seq("GATCRYWSMKHBVDN")
f = SeqFeature(SimpleLocation(5, 6))
self.check(s, f, "Y", "6")
def test_zero_len_dna(self):
"""Feature on DNA (between location, zero length, default strand)."""
s = Seq("GATCRYWSMKHBVDN")
f = SeqFeature(SimpleLocation(5, 5))
self.check(s, f, "", "5^6")
def test_zero_len_dna_end(self):
"""Feature on DNA (between location at end, zero length, default strand)."""
s = Seq("GATCRYWSMKHBVDN")
f = SeqFeature(SimpleLocation(15, 15))
self.check(s, f, "", "15^1")
def test_simple_dna_strand0(self):
"""Feature on DNA (simple, strand 0)."""
s = Seq("GATCRYWSMKHBVDN")
f = SeqFeature(SimpleLocation(5, 10, strand=0))
self.check(s, f, "YWSMK", "6..10")
def test_simple_dna_strand_none(self):
"""Feature on DNA (simple, strand None)."""
s = Seq("GATCRYWSMKHBVDN")
f = SeqFeature(SimpleLocation(5, 10, strand=None))
self.check(s, f, "YWSMK", "6..10")
def test_simple_dna_strand1(self):
"""Feature on DNA (simple, strand +1)."""
s = Seq("GATCRYWSMKHBVDN")
f = SeqFeature(SimpleLocation(5, 10, strand=1))
self.assertEqual(f.location.strand, +1)
self.check(s, f, "YWSMK", "6..10")
def test_simple_dna_strand_minus(self):
"""Feature on DNA (simple, strand -1)."""
s = Seq("GATCRYWSMKHBVDN")
f = SeqFeature(SimpleLocation(5, 10, strand=-1))
self.assertEqual(f.location.strand, -1)
self.check(s, f, "MKSWR", "complement(6..10)")
def test_simple_dna_join(self):
"""Feature on DNA (join, strand +1)."""
s = Seq("GATCRYWSMKHBVDN")
f1 = SeqFeature(SimpleLocation(5, 10, strand=1))
f2 = SeqFeature(SimpleLocation(12, 15, strand=1))
f = make_join_feature([f1, f2])
self.check(s, f, "YWSMKVDN", "join(6..10,13..15)")
def test_simple_dna_join_strand_minus(self):
"""Feature on DNA (join, strand -1)."""
s = Seq("AAAAACCCCCTTTTTGGGGG")
f1 = SeqFeature(SimpleLocation(5, 10, strand=-1))
f2 = SeqFeature(SimpleLocation(12, 15, strand=-1))
f = make_join_feature([f1, f2])
self.check(
s, f, reverse_complement("CCCCCTTT"), "complement(join(6..10,13..15))"
)
def test_simple_dna_join_before(self):
"""Feature on DNA (join, strand -1, before position)."""
s = Seq("AAAAACCCCCTTTTTGGGGG")
f1 = SeqFeature(SimpleLocation(BeforePosition(5), 10, strand=-1))
f2 = SeqFeature(SimpleLocation(12, 15, strand=-1))
f = make_join_feature([f1, f2])
self.check(
s, f, reverse_complement("CCCCCTTT"), "complement(join(<6..10,13..15))"
)
def test_simple_dna_join_after(self):
"""Feature on DNA (join, strand -1, after position)."""
s = Seq("AAAAACCCCCTTTTTGGGGG")
f1 = SeqFeature(SimpleLocation(5, 10, strand=-1))
f2 = SeqFeature(SimpleLocation(12, AfterPosition(15), strand=-1))
f = make_join_feature([f1, f2])
self.check(
s, f, reverse_complement("CCCCCTTT"), "complement(join(6..10,13..>15))"
)
def test_mixed_strand_dna_join(self):
"""Feature on DNA (join, mixed strand)."""
s = Seq("AAAAACCCCCTTTTTGGGGG")
f1 = SeqFeature(SimpleLocation(5, 10, strand=+1))
f2 = SeqFeature(SimpleLocation(12, 15, strand=-1))
f = make_join_feature([f1, f2])
self.check(
s, f, "CCCCC" + reverse_complement("TTT"), "join(6..10,complement(13..15))"
)
def test_mixed_strand_dna_multi_join(self):
"""Feature on DNA (multi-join, mixed strand)."""
s = Seq("AAAAACCCCCTTTTTGGGGG")
f1 = SeqFeature(SimpleLocation(5, 10, strand=+1))
f2 = SeqFeature(SimpleLocation(12, 15, strand=-1))
f3 = SeqFeature(SimpleLocation(BeforePosition(0), 5, strand=+1))
f = make_join_feature([f1, f2, f3])
self.check(
s,
f,
"CCCCC" + reverse_complement("TTT") + "AAAAA",
"join(6..10,complement(13..15),<1..5)",
)
def test_protein_simple(self):
"""Feature on protein (simple)."""
s = Seq("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
f = SeqFeature(SimpleLocation(5, 10))
self.check(s, f, "FGHIJ", "6..10")
def test_protein_join(self):
"""Feature on protein (join)."""
s = Seq("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
f1 = SeqFeature(SimpleLocation(5, 10))
f2 = SeqFeature(SimpleLocation(15, 20))
f = make_join_feature([f1, f2])
self.check(s, f, "FGHIJPQRST", "join(6..10,16..20)")
def test_protein_join_fuzzy(self):
"""Feature on protein (fuzzy join)."""
s = Seq("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
f1 = SeqFeature(SimpleLocation(BeforePosition(5), 10))
f2 = SeqFeature(
SimpleLocation(
OneOfPosition(15, (ExactPosition(15), ExactPosition(16))),
AfterPosition(20),
)
)
f = make_join_feature([f1, f2])
self.check(s, f, "FGHIJPQRST", "join(<6..10,one-of(16,17)..>20)")
def test_protein_multi_join(self):
"""Feature on protein (multi-join)."""
s = Seq("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
f1 = SeqFeature(SimpleLocation(1, 2))
f2 = SeqFeature(SimpleLocation(8, 9))
f3 = SeqFeature(SimpleLocation(14, 16))
f4 = SeqFeature(SimpleLocation(24, 25))
f5 = SeqFeature(SimpleLocation(19, 20))
f6 = SeqFeature(SimpleLocation(7, 8))
f7 = SeqFeature(SimpleLocation(14, 15))
f8 = SeqFeature(SimpleLocation(13, 14))
f = make_join_feature([f1, f2, f3, f4, f5, f6, f7, f8])
self.check(s, f, "BIOPYTHON", "join(2,9,15..16,25,20,8,15,14)")
def test_protein_between(self):
"""Feature on protein (between location, zero length)."""
s = Seq("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
f = SeqFeature(SimpleLocation(5, 5))
self.check(s, f, "", "5^6")
def test_protein_oneof(self):
"""Feature on protein (one-of positions)."""
s = Seq("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
start = OneOfPosition(5, (ExactPosition(5), ExactPosition(7)))
end = OneOfPosition(11, (ExactPosition(10), ExactPosition(11)))
f = SeqFeature(SimpleLocation(start, 10))
self.check(s, f, "FGHIJ", "one-of(6,8)..10")
f = SeqFeature(SimpleLocation(start, end))
self.check(s, f, "FGHIJK", "one-of(6,8)..one-of(10,11)")
f = SeqFeature(SimpleLocation(5, end))
self.check(s, f, "FGHIJK", "6..one-of(10,11)")
class SeqFeatureCreation(unittest.TestCase):
"""Test basic creation of SeqFeatures."""
def test_qualifiers(self):
"""Pass in qualifiers to SeqFeatures."""
f = SeqFeature(SimpleLocation(10, 20, strand=+1), type="CDS")
self.assertEqual(f.qualifiers, {})
f = SeqFeature(
SimpleLocation(10, 20, strand=+1),
type="CDS",
qualifiers={"test": ["a test"]},
)
self.assertEqual(f.qualifiers["test"], ["a test"])
class FeatureWriting(SeqIOFeatureTestBaseClass):
def setUp(self):
self.record = SeqRecord(
Seq("ACGT" * 100), id="Test", name="Test", description="Test"
)
self.record.annotations["molecule_type"] = "DNA"
def write_read_check(self, check_format):
handle = StringIO()
SeqIO.write([self.record], handle, check_format)
handle.seek(0)
record2 = SeqIO.read(handle, check_format)
self.compare_record(self.record, record2)
def write_read_checks(self, formats=("gb", "embl", "imgt")):
for f in formats:
self.write_read_check(f)
def test_exact(self):
"""GenBank/EMBL write/read simple exact locations."""
# Note we don't have to explicitly give an ExactPosition object,
# an integer will also work:
f = SeqFeature(SimpleLocation(10, 20, strand=+1), type="CDS")
self.assertEqual(_get_location_string(f, 100), "11..20")
self.assertEqual(_get_location_string(f._flip(20), 20), "complement(1..10)")
self.assertEqual(_get_location_string(f._flip(100), 100), "complement(81..90)")
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
f = SeqFeature(SimpleLocation(30, 40, strand=-1), type="CDS")
self.assertEqual(_get_location_string(f, 100), "complement(31..40)")
self.assertEqual(_get_location_string(f._flip(40), 40), "1..10")
self.assertEqual(_get_location_string(f._flip(100), 100), "61..70")
self.assertEqual(f._flip(100).location.strand, +1)
self.record.features.append(f)
f = SeqFeature(
SimpleLocation(ExactPosition(50), ExactPosition(60), strand=+1),
type="CDS",
)
self.assertEqual(_get_location_string(f, 100), "51..60")
self.assertEqual(_get_location_string(f._flip(60), 60), "complement(1..10)")
self.assertEqual(_get_location_string(f._flip(100), 100), "complement(41..50)")
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
self.write_read_checks()
# The write/check won't work on strandless features due to the
# limitations of the GenBank (and EMBL) feature location scheme
for s in [0, None]:
# Check flipping of a simple strand 0 feature:
f = SeqFeature(SimpleLocation(0, 100, strand=s), type="source")
self.assertEqual(_get_location_string(f, 100), "1..100")
self.assertEqual(_get_location_string(f._flip(100), 100), "1..100")
self.assertEqual(_get_location_string(f._flip(200), 200), "101..200")
self.assertEqual(f._flip(100).location.strand, f.location.strand)
# Test for compound locations (see https://github.com/biopython/biopython/issues/4611)
# flip with +1/-1 strands does not invert the order in compound locations
loc = SimpleLocation(4, 6, 1) + SimpleLocation(0, 1, 1)
self.assertEqual(str(loc._flip(6)), "join{[0:2](-), [5:6](-)}")
loc = SimpleLocation(4, 6, -1) + SimpleLocation(0, 1, -1)
self.assertEqual(str(loc._flip(6)), "join{[0:2](+), [5:6](+)}")
# flip with None strand inverts the order in compound locations
loc = SimpleLocation(4, 6, None) + SimpleLocation(0, 1, None)
self.assertEqual(str(loc._flip(6)), "join{[5:6], [0:2]}")
def test_between(self):
"""GenBank/EMBL write/read simple between locations."""
# Note we don't use the BetweenPosition any more!
f = SeqFeature(SimpleLocation(10, 10, strand=+1), type="variation")
self.assertEqual(_get_location_string(f, 100), "10^11")
self.assertEqual(_get_location_string(f._flip(20), 20), "complement(10^11)")
self.assertEqual(_get_location_string(f._flip(100), 100), "complement(90^91)")
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
f = SeqFeature(SimpleLocation(20, 20, strand=-1), type="variation")
self.assertEqual(_get_location_string(f, 100), "complement(20^21)")
self.assertEqual(_get_location_string(f._flip(40), 40), "20^21")
self.assertEqual(_get_location_string(f._flip(100), 100), "80^81")
self.assertEqual(f._flip(100).location.strand, +1)
self.record.features.append(f)
self.write_read_checks()
def test_unknown(self):
"""GenBank/EMBL write/read with unknown end points."""
f = SeqFeature(SimpleLocation(10, 15, strand=+1), type="region")
self.assertEqual(_get_location_string(f, 100), "11..15")
self.record.features.append(f)
f = SeqFeature(SimpleLocation(10, UnknownPosition(), strand=+1), type="region")
self.assertEqual(_get_location_string(f, 100), "11..>11")
self.record.features.append(f)
f = SeqFeature(SimpleLocation(UnknownPosition(), 15, strand=+1), type="region")
self.assertEqual(_get_location_string(f, 100), "<15..15")
self.record.features.append(f)
f = SeqFeature(SimpleLocation(10, 15, strand=-1), type="region")
self.assertEqual(_get_location_string(f, 100), "complement(11..15)")
f = SeqFeature(SimpleLocation(10, UnknownPosition(), strand=-1), type="region")
self.assertEqual(_get_location_string(f, 100), "complement(11..>11)")
self.record.features.append(f)
f = SeqFeature(SimpleLocation(UnknownPosition(), 15, strand=-1), type="region")
self.assertEqual(_get_location_string(f, 100), "complement(<15..15)")
self.record.features.append(f)
# This doesn't round trip
# self.write_read_checks()
def test_join(self):
"""GenBank/EMBL write/read simple join locations."""
f1 = SeqFeature(SimpleLocation(10, 20, strand=+1))
f2 = SeqFeature(SimpleLocation(25, 40, strand=+1))
f = make_join_feature([f1, f2])
self.record.features.append(f)
self.assertEqual(_get_location_string(f, 500), "join(11..20,26..40)")
self.assertEqual(
_get_location_string(f._flip(60), 60), "complement(join(21..35,41..50))"
)
self.assertEqual(
_get_location_string(f._flip(100), 100), "complement(join(61..75,81..90))"
)
self.assertEqual(f._flip(100).location.strand, -1)
for sub_loc in f._flip(100).location.parts:
self.assertEqual(sub_loc.strand, -1)
f1 = SeqFeature(SimpleLocation(110, 120, strand=+1))
f2 = SeqFeature(SimpleLocation(125, 140, strand=+1))
f3 = SeqFeature(SimpleLocation(145, 150, strand=+1))
f = make_join_feature([f1, f2, f3], "CDS")
self.assertEqual(
_get_location_string(f, 500), "join(111..120,126..140,146..150)"
)
self.assertEqual(
_get_location_string(f._flip(150), 150),
"complement(join(1..5,11..25,31..40))",
)
self.assertEqual(f._flip(100).location.strand, -1)
for sub_loc in f._flip(100).location.parts:
self.assertEqual(sub_loc.strand, -1)
self.record.features.append(f)
f1 = SeqFeature(SimpleLocation(210, 220, strand=-1))
f2 = SeqFeature(SimpleLocation(225, 240, strand=-1))
f = make_join_feature([f1, f2], ftype="gene")
self.assertEqual(
_get_location_string(f, 500), "complement(join(211..220,226..240))"
)
self.assertEqual(_get_location_string(f._flip(300), 300), "join(61..75,81..90)")
self.assertEqual(f._flip(100).location.strand, +1)
for sub_loc in f._flip(100).location.parts:
self.assertEqual(sub_loc.strand, +1)
self.record.features.append(f)
f1 = SeqFeature(SimpleLocation(310, 320, strand=-1))
f2 = SeqFeature(SimpleLocation(325, 340, strand=-1))
f3 = SeqFeature(SimpleLocation(345, 350, strand=-1))
f = make_join_feature([f1, f2, f3], "CDS")
self.assertEqual(
_get_location_string(f, 500), "complement(join(311..320,326..340,346..350))"
)
self.assertEqual(
_get_location_string(f._flip(350), 350), "join(1..5,11..25,31..40)"
)
self.assertEqual(f._flip(100).location.strand, +1)
for sub_loc in f._flip(100).location.parts:
self.assertEqual(sub_loc.strand, +1)
self.record.features.append(f)
self.write_read_checks()
def test_fuzzy_join(self):
"""Features: write/read fuzzy join locations."""
s = "N" * 500
f1 = SeqFeature(SimpleLocation(BeforePosition(10), 20, strand=+1))
f2 = SeqFeature(SimpleLocation(25, AfterPosition(40), strand=+1))
f = make_join_feature([f1, f2])
self.record.features.append(f)
self.assertEqual(_get_location_string(f, 500), "join(<11..20,26..>40)")
self.assertEqual(
_get_location_string(f._flip(100), 100), "complement(join(<61..75,81..>90))"
)
self.assertEqual(f.location.strand, +1)
for sub_loc in f.location.parts:
self.assertEqual(sub_loc.strand, +1)
tmp = f._flip(100)
self.assertEqual(tmp.location.strand, -1)
for sub_loc in tmp.location.parts:
self.assertEqual(sub_loc.strand, -1)
f1 = SeqFeature(
SimpleLocation(
OneOfPosition(107, [ExactPosition(107), ExactPosition(110)]),
120,
strand=+1,
),
)
f2 = SeqFeature(SimpleLocation(125, 140, strand=+1))
f3 = SeqFeature(
SimpleLocation(145, WithinPosition(160, left=150, right=160), strand=+1)
)
f = make_join_feature([f1, f2, f3], "CDS")
self.assertEqual(
_get_location_string(f, 500),
"join(one-of(108,111)..120,126..140,146..(150.160))",
)
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(
_get_location_string(f._flip(200), 200),
"complement(join((41.51)..55,61..75,81..one-of(90,93)))",
)
self.assertEqual(f.location.strand, +1)
for sub_loc in f.location.parts:
self.assertEqual(sub_loc.strand, +1)
tmp = f._flip(100)
self.assertEqual(tmp.location.strand, -1)
for sub_loc in tmp.location.parts:
self.assertEqual(sub_loc.strand, -1)
self.record.features.append(f)
f1 = SeqFeature(SimpleLocation(BeforePosition(210), 220, strand=-1))
f2 = SeqFeature(
SimpleLocation(225, WithinPosition(244, left=240, right=244), strand=-1)
)
f = make_join_feature([f1, f2], "gene")
self.assertEqual(
_get_location_string(f, 500), "complement(join(<211..220,226..(240.244)))"
)
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(
_get_location_string(f._flip(300), 300), "join((57.61)..75,81..>90)"
)
self.assertEqual(f.location.strand, -1)
for sub_loc in f.location.parts:
self.assertEqual(sub_loc.strand, -1)
tmp = f._flip(100)
self.assertEqual(tmp.location.strand, +1)
for sub_loc in tmp.location.parts:
self.assertEqual(sub_loc.strand, +1)
self.record.features.append(f)
f1 = SeqFeature(SimpleLocation(AfterPosition(310), 320, strand=-1))
# Note - is one-of(340,337) allowed or should it be one-of(337,340)?
pos = OneOfPosition(340, [ExactPosition(340), ExactPosition(337)])
f2 = SeqFeature(SimpleLocation(325, pos, strand=-1))
f3 = SeqFeature(
SimpleLocation(345, WithinPosition(355, left=350, right=355), strand=-1)
)
f = make_join_feature([f1, f2, f3], "CDS")
self.assertEqual(
_get_location_string(f, 500),
"complement(join(>311..320,326..one-of(340,337),346..(350.355)))",
)
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(
_get_location_string(f._flip(400), 400),
"join((46.51)..55,one-of(64,61)..75,81..<90)",
)
self.assertEqual(f.location.strand, -1)
tmp = f._flip(100)
self.assertEqual(tmp.location.strand, +1)
for sub_loc in tmp.location.parts:
self.assertEqual(sub_loc.strand, +1)
self.record.features.append(f)
self.write_read_checks()
def test_before(self):
"""Features: write/read simple before locations."""
s = "N" * 200
f = SeqFeature(SimpleLocation(BeforePosition(5), 10, strand=+1), type="CDS")
self.assertEqual(_get_location_string(f, 100), "<6..10")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(20), 20), "complement(11..>15)")
self.assertEqual(f.location.strand, +1)
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
f = SeqFeature(
SimpleLocation(BeforePosition(15), BeforePosition(20), strand=+1),
type="CDS",
)
self.assertEqual(_get_location_string(f, 100), "<16..<20")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(20), 20), "complement(>1..>5)")
self.assertEqual(f.location.strand, +1)
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
f = SeqFeature(SimpleLocation(25, BeforePosition(30), strand=+1), type="CDS")
self.assertEqual(_get_location_string(f, 100), "26..<30")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(40), 40), "complement(>11..15)")
self.assertEqual(f.location.strand, +1)
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
f = SeqFeature(SimpleLocation(BeforePosition(35), 40, strand=-1), type="CDS")
self.assertEqual(_get_location_string(f, 100), "complement(<36..40)")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(40), 40), "1..>5")
self.assertEqual(f.location.strand, -1)
self.assertEqual(f._flip(100).location.strand, +1)
self.record.features.append(f)
f = SeqFeature(
SimpleLocation(BeforePosition(45), BeforePosition(50), strand=-1),
type="CDS",
)
self.assertEqual(_get_location_string(f, 100), "complement(<46..<50)")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(100), 100), ">51..>55")
self.assertEqual(f.location.strand, -1)
self.assertEqual(f._flip(100).location.strand, +1)
self.record.features.append(f)
f = SeqFeature(SimpleLocation(55, BeforePosition(60), strand=-1), type="CDS")
self.assertEqual(_get_location_string(f, 100), "complement(56..<60)")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(100), 100), ">41..45")
self.assertEqual(f.location.strand, -1)
self.assertEqual(f._flip(100).location.strand, +1)
self.record.features.append(f)
self.write_read_checks()
def test_after(self):
"""Features: write/read simple after locations."""
s = "N" * 200
f = SeqFeature(SimpleLocation(AfterPosition(5), 10, strand=+1), type="CDS")
self.assertEqual(_get_location_string(f, 100), ">6..10")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(100), 100), "complement(91..<95)")
self.assertEqual(f.location.strand, +1)
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
f = SeqFeature(
SimpleLocation(AfterPosition(15), AfterPosition(20), strand=+1),
type="CDS",
)
self.assertEqual(_get_location_string(f, 100), ">16..>20")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(20), 20), "complement(<1..<5)")
self.assertEqual(f.location.strand, +1)
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
f = SeqFeature(SimpleLocation(25, AfterPosition(30), strand=+1), type="CDS")
self.assertEqual(_get_location_string(f, 100), "26..>30")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(30), 30), "complement(<1..5)")
self.assertEqual(f.location.strand, +1)
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
f = SeqFeature(SimpleLocation(AfterPosition(35), 40, strand=-1), type="CDS")
self.assertEqual(_get_location_string(f, 100), "complement(>36..40)")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(100), 100), "61..<65")
self.assertEqual(f.location.strand, -1)
self.assertEqual(f._flip(100).location.strand, +1)
self.record.features.append(f)
f = SeqFeature(
SimpleLocation(AfterPosition(45), AfterPosition(50), strand=-1),
type="CDS",
)
self.assertEqual(_get_location_string(f, 100), "complement(>46..>50)")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(100), 100), "<51..<55")
self.assertEqual(f.location.strand, -1)
self.assertEqual(f._flip(100).location.strand, +1)
self.record.features.append(f)
f = SeqFeature(SimpleLocation(55, AfterPosition(60), strand=-1), type="CDS")
self.assertEqual(_get_location_string(f, 100), "complement(56..>60)")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(100), 100), "<41..45")
self.assertEqual(f.location.strand, -1)
self.assertEqual(f._flip(100).location.strand, +1)
self.record.features.append(f)
self.write_read_checks()
def test_oneof(self):
"""Features: write/read simple one-of locations."""
s = "N" * 100
start = OneOfPosition(0, [ExactPosition(0), ExactPosition(3), ExactPosition(6)])
f = SeqFeature(SimpleLocation(start, 21, strand=+1), type="CDS")
self.assertEqual(_get_location_string(f, 100), "one-of(1,4,7)..21")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(
_get_location_string(f._flip(100), 100), "complement(80..one-of(94,97,100))"
)
self.assertEqual(f.location.strand, +1)
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
start = OneOfPosition(10, [ExactPosition(x) for x in [10, 13, 16]])
end = OneOfPosition(50, [ExactPosition(x) for x in [41, 44, 50]])
f = SeqFeature(SimpleLocation(start, end, strand=+1), type="gene")
self.assertEqual(
_get_location_string(f, 100), "one-of(11,14,17)..one-of(41,44,50)"
)
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(
_get_location_string(f._flip(50), 50),
"complement(one-of(1,7,10)..one-of(34,37,40))",
)
self.assertEqual(f.location.strand, +1)
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
end = OneOfPosition(33, [ExactPosition(x) for x in [30, 33]])
f = SeqFeature(SimpleLocation(27, end, strand=+1), type="gene")
self.assertEqual(_get_location_string(f, 100), "28..one-of(30,33)")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(
_get_location_string(f._flip(40), 40), "complement(one-of(8,11)..13)"
)
self.assertEqual(f.location.strand, +1)
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
start = OneOfPosition(36, [ExactPosition(x) for x in [36, 40]])
f = SeqFeature(SimpleLocation(start, 46, strand=-1), type="CDS")
self.assertEqual(_get_location_string(f, 100), "complement(one-of(37,41)..46)")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(50), 50), "5..one-of(10,14)")
self.assertEqual(f.location.strand, -1)
self.assertEqual(f._flip(100).location.strand, +1)
self.record.features.append(f)
start = OneOfPosition(45, [ExactPosition(x) for x in [45, 60]])
end = OneOfPosition(90, [ExactPosition(x) for x in [70, 90]])
f = SeqFeature(SimpleLocation(start, end, strand=-1), type="CDS")
self.assertEqual(
_get_location_string(f, 100), "complement(one-of(46,61)..one-of(70,90))"
)
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(
_get_location_string(f._flip(100), 100), "one-of(11,31)..one-of(40,55)"
)
self.assertEqual(f.location.strand, -1)
self.assertEqual(f._flip(100).location.strand, +1)
self.record.features.append(f)
end = OneOfPosition(63, [ExactPosition(x) for x in [60, 63]])
f = SeqFeature(SimpleLocation(55, end, strand=-1), type="tRNA")
self.assertEqual(_get_location_string(f, 100), "complement(56..one-of(60,63))")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(100), 100), "one-of(38,41)..45")
self.assertEqual(f.location.strand, -1)
self.assertEqual(f._flip(100).location.strand, +1)
self.record.features.append(f)
self.write_read_checks()
def test_within(self):
"""Features: write/read simple within locations."""
s = "N" * 100
f = SeqFeature(
SimpleLocation(WithinPosition(2, left=2, right=8), 10, strand=+1),
type="CDS",
)
self.assertEqual(_get_location_string(f, 100), "(3.9)..10")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(
_get_location_string(f._flip(20), 20), "complement(11..(12.18))"
)
self.assertEqual(f.location.strand, +1)
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
f = SeqFeature(
SimpleLocation(
WithinPosition(12, left=12, right=18),
WithinPosition(28, left=20, right=28),
strand=+1,
),
type="CDS",
)
self.assertEqual(_get_location_string(f, 100), "(13.19)..(20.28)")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(
_get_location_string(f._flip(30), 30), "complement((3.11)..(12.18))"
)
self.assertEqual(f.location.strand, +1)
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
f = SeqFeature(
SimpleLocation(25, WithinPosition(33, left=30, right=33), strand=+1),
type="misc_feature",
)
self.assertEqual(_get_location_string(f, 100), "26..(30.33)")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(
_get_location_string(f._flip(40), 40), "complement((8.11)..15)"
)
self.assertEqual(f.location.strand, +1)
self.assertEqual(f._flip(100).location.strand, -1)
self.record.features.append(f)
f = SeqFeature(
SimpleLocation(WithinPosition(35, left=35, right=39), 40, strand=-1),
type="rRNA",
)
self.assertEqual(_get_location_string(f, 100), "complement((36.40)..40)")
self.assertEqual(_get_location_string(f._flip(40), 40), "1..(1.5)")
self.assertEqual(f.location.strand, -1)
self.assertEqual(f._flip(100).location.strand, +1)
self.record.features.append(f)
f = SeqFeature(
SimpleLocation(
WithinPosition(45, left=45, right=47),
WithinPosition(53, left=50, right=53),
strand=-1,
),
type="repeat_region",
)
self.assertEqual(_get_location_string(f, 100), "complement((46.48)..(50.53))")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(60), 60), "(8.11)..(13.15)")
self.assertEqual(f.location.strand, -1)
self.assertEqual(f._flip(100).location.strand, +1)
self.record.features.append(f)
f = SeqFeature(
SimpleLocation(55, WithinPosition(65, left=60, right=65), strand=-1),
type="CDS",
)
self.assertEqual(_get_location_string(f, 100), "complement(56..(60.65))")
self.assertEqual(len(f), len(f.extract(s)))
self.assertEqual(_get_location_string(f._flip(100), 100), "(36.41)..45")
self.assertEqual(f.location.strand, -1)
self.assertEqual(f._flip(100).location.strand, +1)
self.record.features.append(f)
self.write_read_checks()
def test_datetime(self):
default_locale = locale.setlocale(locale.LC_ALL)
for locale_name in [
"C",
"de_DE.utf8",
"fr_FR.utf8",
"pt_BR.utf8",
]:
try:
locale.setlocale(locale.LC_ALL, locale_name)
# Why october ? Because it is abbreviated to OKT in DE, OCT. in FR, OUT in BR
self.record.annotations["date"] = datetime.datetime.strptime(
"2025-10-01", "%Y-%m-%d"
)
self.write_read_check("gb")
except locale.Error:
pass
locale.setlocale(locale.LC_ALL, default_locale)
def test_date(self):
for m in [1, 10, 12]:
self.record.annotations["date"] = datetime.datetime.strptime(
f"2025-{m}-01", "%Y-%m-%d"
).date()
self.write_read_check("gb")
def test_invalid_date_format(self):
# Using a date in the wrong format
self.record.annotations["date"] = "04-04-1970"
stream = StringIO()
with warnings.catch_warnings(record=True) as w:
SeqIO.write([self.record], stream, "gb")
self.assertEqual(len(w), 1, "a warning should be raised")
self.assertIn("Invalid date format", str(w[0].message))
stream.seek(0)
self.assertIn("1980", stream.getvalue())
self.assertNotIn(self.record.annotations["date"], stream.getvalue())
def test_invalid_date_locale(self):
# Using a date not in english which is not accepted by the writer
self.record.annotations["date"] = "04-OKT-1970"
stream = StringIO()
with warnings.catch_warnings(record=True) as w:
SeqIO.write([self.record], stream, "gb")
self.assertEqual(len(w), 1, "a warning should be raised")
self.assertIn("Invalid date", str(w[0].message))
stream.seek(0)
self.assertIn("1980", stream.getvalue())
self.assertNotIn(self.record.annotations["date"], stream.getvalue())
class NC_000932(SeqIOFeatureTestBaseClass):
# This includes an evil dual strand gene
basename = "NC_000932"
emblname = None # "AP000423" has different annotation (e.g. more CDS)
table = 11
skip_trans_test = [
"gi|7525080|ref|NP_051037.1|", # dual-strand
"gi|7525057|ref|NP_051038.1|", # dual-strand
"gi|90110725|ref|NP_051109.2|", # Invalid annotation? No start codon
]
__doc__ = f"Tests using {basename} GenBank and FASTA files from the NCBI"
# TODO - neat way to change the docstrings...
def setUp(self):
self.gb_filename = os.path.join("GenBank", self.basename + ".gb")
self.ffn_filename = os.path.join("GenBank", self.basename + ".ffn")
self.faa_filename = os.path.join("GenBank", self.basename + ".faa")
self.fna_filename = os.path.join("GenBank", self.basename + ".fna")
if self.emblname:
self.embl_filename = os.path.join("EMBL", self.emblname + ".embl")
# These tests only need the GenBank file and the FAA file:
def test_CDS(self):
"""Checking GenBank CDS translations vs FASTA faa file."""
gb_record = SeqIO.read(self.gb_filename, "genbank")
gb_cds = list(SeqIO.parse(self.gb_filename, "genbank-cds"))
fasta = list(SeqIO.parse(self.faa_filename, "fasta"))
self.compare_records(gb_cds, fasta)
cds_features = [f for f in gb_record.features if f.type == "CDS"]
self.assertEqual(len(cds_features), len(fasta))
for f, r in zip(cds_features, fasta):
if r.id in self.skip_trans_test:
continue
# Get the nucleotides and translate them
nuc = f.extract(gb_record.seq)
self.assertEqual(len(nuc), len(f))
try:
pro = nuc.translate(table=self.table, cds=True)
except TranslationError as e:
msg = f"{e}\n{r.id!r}, {nuc!r}, {self.table!r}\n{f}"
self.fail(msg)
if pro[-1] == "*":
self.assertEqual(pro[:-1], r.seq)
else:
self.assertEqual(pro, r.seq)
class NC_005816(NC_000932):
basename = "NC_005816"
emblname = "AE017046"
table = 11
skip_trans_test = []
__doc__ = f"Tests using {basename} GenBank and FASTA files from the NCBI"
def test_GenBank_vs_EMBL(self):
if not self.emblname:
return
gb_record = SeqIO.read(self.gb_filename, "genbank")
embl_record = SeqIO.read(self.embl_filename, "embl")
if len(embl_record.features) < len(gb_record.features):
# Used to match, but I've update the GenBank files
# which now has lots of misc_feature entries not in EMBL
embl_record.features = [
f for f in embl_record.features if f.type != "misc_feature"
]
gb_record.features = [
f for f in gb_record.features if f.type != "misc_feature"
]
self.compare_record(gb_record, embl_record, expect_minor_diffs=True)
def test_Translations(self):
"""Checking translation of FASTA features (faa vs ffn)."""
faa_records = list(SeqIO.parse(self.faa_filename, "fasta"))
ffn_records = list(SeqIO.parse(self.ffn_filename, "fasta"))
self.assertEqual(len(faa_records), len(ffn_records))
for faa, fna in zip(faa_records, ffn_records):
translation = fna.seq.translate(self.table, cds=True)
if faa.id in self.skip_trans_test:
continue
t = SeqRecord(
translation, id="Translation", description=f"Table {self.table}"
)
msg = "FAA vs FNA translation problem:\n%s\n%s\n%s\n" % (
fna.format("fasta"),
t.format("fasta"),
faa.format("fasta"),
)
self.assertTrue(translation == faa.seq or translation != faa.seq + "*")
def test_Genome(self):
"""Checking GenBank sequence vs FASTA fna file."""
gb_record = SeqIO.read(self.gb_filename, "genbank")
fa_record = SeqIO.read(self.fna_filename, "fasta")
self.compare_record(gb_record, fa_record)
if self.emblname is None:
return
embl_record = SeqIO.read(self.embl_filename, "embl")
if len(embl_record.features) < len(gb_record.features):
# Hack since now out of sync for NC_005816
embl_record.features = [
f for f in embl_record.features if f.type != "misc_feature"
]
gb_record.features = [
f for f in gb_record.features if f.type != "misc_feature"
]
self.compare_record(gb_record, embl_record, expect_minor_diffs=True)
def test_Features(self):
"""Checking GenBank features sequences vs FASTA ffn file."""
gb_record = SeqIO.read(self.gb_filename, "genbank")
features = [f for f in gb_record.features if f.type == "CDS"]
fa_records = list(SeqIO.parse(self.ffn_filename, "fasta"))
self.assertEqual(len(fa_records), len(features))
# This assumes they are in the same order...
for fa_record, f in zip(fa_records, features):
# TODO - check the FASTA ID line against the coordinates?
f_seq = f.extract(gb_record.seq)
self.assertEqual(fa_record.seq, f_seq)
self.assertEqual(len(f_seq), len(f))
class TestWriteRead(SeqIOFeatureTestBaseClass):
"""Test can write and read back files."""
def write_read(self, filename, in_format="gb", out_formats=("gb", "embl", "imgt")):
for out_format in out_formats:
msg = f"Convert {filename} from {in_format} to {out_format}"
gb_records = list(SeqIO.parse(filename, in_format))
# Write it out...
handle = StringIO()
SeqIO.write(gb_records, handle, out_format)
handle.seek(0)
# Now load it back and check it agrees,
gb_records2 = list(SeqIO.parse(handle, out_format))
self.compare_records(gb_records, gb_records2, msg=msg)
def test_NC_000932(self):
"""Write and read back NC_000932.gb."""
self.write_read(os.path.join("GenBank", "NC_000932.gb"), "gb")
def test_NC_005816(self):
"""Write and read back NC_005816.gb."""
self.write_read(os.path.join("GenBank", "NC_005816.gb"), "gb")
def test_gbvrl1_start(self):
"""Write and read back gbvrl1_start.seq."""
self.write_read(os.path.join("GenBank", "gbvrl1_start.seq"), "gb")
def test_NT_019265(self):
"""Write and read back NT_019265.gb."""
self.write_read(os.path.join("GenBank", "NT_019265.gb"), "gb")
def test_cor6(self):
"""Write and read back cor6_6.gb."""
self.write_read(os.path.join("GenBank", "cor6_6.gb"), "gb")
def test_arab1(self):
"""Write and read back arab1.gb."""
self.write_read(os.path.join("GenBank", "arab1.gb"), "gb")
def test_one_of(self):
"""Write and read back of_one.gb."""
self.write_read(os.path.join("GenBank", "one_of.gb"), "gb")
def test_pri1(self):
"""Write and read back pri1.gb."""
self.write_read(os.path.join("GenBank", "pri1.gb"), "gb")
def test_noref(self):
"""Write and read back noref.gb."""
self.write_read(os.path.join("GenBank", "noref.gb"), "gb")
def test_origin_line(self):
"""Write and read back origin_line.gb."""
self.write_read(os.path.join("GenBank", "origin_line.gb"), "gb")
def test_dbsource_wrap(self):
"""Write and read back dbsource_wrap.gb."""
with warnings.catch_warnings():
# Ignore warning about over long DBSOURCE line
warnings.simplefilter("ignore", category=BiopythonWarning)
self.write_read(os.path.join("GenBank", "dbsource_wrap.gb"), "gb", ["gb"])
# Protein so can't convert this to EMBL format
def test_blank_seq(self):
"""Write and read back blank_seq.gb."""
self.write_read(os.path.join("GenBank", "blank_seq.gb"), "gb", ["gb"])
# Protein so can't convert this to EMBL format
def test_extra_keywords(self):
"""Write and read back extra_keywords.gb."""
self.write_read(os.path.join("GenBank", "extra_keywords.gb"), "gb")
def test_protein_refseq(self):
"""Write and read back protein_refseq.gb."""
self.write_read(os.path.join("GenBank", "protein_refseq.gb"), "gb", ["gb"])
# Protein so can't convert this to EMBL format
def test_protein_refseq2(self):
"""Write and read back protein_refseq2.gb."""
self.write_read(os.path.join("GenBank", "protein_refseq2.gb"), "gb", ["gb"])
# Protein so can't convert this to EMBL format
def test_AAA03323(self):
"""Write and read back AAA03323.embl."""
self.write_read(os.path.join("EMBL", "AAA03323.embl"), "embl")
def test_AE017046(self):
"""Write and read back AE017046.embl."""
self.write_read(os.path.join("EMBL", "AE017046.embl"), "embl")
def test_DD231055_edited(self):
"""Write and read back DD231055_edited.embl."""
self.write_read(os.path.join("EMBL", "DD231055_edited.embl"), "embl")
def test_Human_contigs(self):
"""Write and read back Human_contigs.embl."""
self.write_read(os.path.join("EMBL", "Human_contigs.embl"), "embl")
def test_SC10H5(self):
"""Write and read back SC10H5.embl."""
self.write_read(os.path.join("EMBL", "SC10H5.embl"), "embl")
def test_TRBG361(self):
"""Write and read back TRBG361.embl."""
self.write_read(os.path.join("EMBL", "TRBG361.embl"), "embl")
def test_U87107(self):
"""Write and read back U87107.embl."""
self.write_read(os.path.join("EMBL", "U87107.embl"), "embl")
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=2)
unittest.main(testRunner=runner)