mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
Locale independent date serialization, also accept datetime.date Add warning when date is invalid/not in the expected format Closes #4972
1375 lines
59 KiB
Python
1375 lines
59 KiB
Python
# Copyright 2009-2016 by Peter Cock. All rights reserved.
|
|
# This code is part of the Biopython distribution and governed by its
|
|
# license. Please see the LICENSE file that should have been included
|
|
# as part of this package.
|
|
"""SeqFeature related tests for SeqRecord objects from Bio.SeqIO.
|
|
|
|
Initially this takes matched tests of GenBank and FASTA files from the NCBI
|
|
and confirms they are consistent using our different parsers.
|
|
"""
|
|
import datetime
|
|
import locale
|
|
import os
|
|
import unittest
|
|
import warnings
|
|
from io import StringIO
|
|
|
|
from test_SeqIO import SeqIOTestBaseClass
|
|
|
|
from Bio import BiopythonWarning
|
|
from Bio import SeqIO
|
|
from Bio.Data.CodonTable import TranslationError
|
|
from Bio.Seq import MutableSeq
|
|
from Bio.Seq import reverse_complement
|
|
from Bio.Seq import Seq
|
|
from Bio.Seq import UndefinedSequenceError
|
|
from Bio.SeqFeature import AfterPosition
|
|
from Bio.SeqFeature import BeforePosition
|
|
from Bio.SeqFeature import CompoundLocation
|
|
from Bio.SeqFeature import ExactPosition
|
|
from Bio.SeqFeature import OneOfPosition
|
|
from Bio.SeqFeature import SeqFeature
|
|
from Bio.SeqFeature import SimpleLocation
|
|
from Bio.SeqFeature import UnknownPosition
|
|
from Bio.SeqFeature import WithinPosition
|
|
from Bio.SeqIO.InsdcIO import _insdc_location_string
|
|
from Bio.SeqRecord import SeqRecord
|
|
|
|
|
|
def _get_location_string(feature, record_length):
|
|
return _insdc_location_string(feature.location, record_length)
|
|
|
|
|
|
def make_join_feature(f_list, ftype="misc_feature"):
|
|
# NOTE - Does NOT reorder the sub-features
|
|
if len({f.location.strand for f in f_list}) == 1:
|
|
strand = f_list[0].location.strand
|
|
else:
|
|
strand = None
|
|
for f in f_list:
|
|
f.type = ftype
|
|
if strand == -1:
|
|
# All reverse strand.
|
|
# Put into biological order for CompoundLocation
|
|
c_loc = CompoundLocation([f.location for f in f_list[::-1]])
|
|
else:
|
|
# All forward, or mixed strand
|
|
c_loc = CompoundLocation([f.location for f in f_list])
|
|
return SeqFeature(c_loc, ftype)
|
|
|
|
|
|
# Prepare a single GenBank record with one feature with a %s place holder for
|
|
# the feature location [leaves the source feature in place]
|
|
with open("GenBank/iro.gb") as handle:
|
|
gbk_template = handle.read()
|
|
gbk_template = gbk_template.replace(
|
|
' gene 341..756\n /gene="FTCD"\n',
|
|
' misc_feature %s\n /note="Test case"\n',
|
|
)
|
|
gbk_template = gbk_template.replace(
|
|
" exon 341..384\n"
|
|
' /gene="FTCD"\n'
|
|
" /number=1\n",
|
|
"",
|
|
)
|
|
gbk_template = gbk_template.replace(
|
|
" intron 385..617\n"
|
|
' /gene="FTCD"\n'
|
|
" /number=1\n",
|
|
"",
|
|
)
|
|
gbk_template = gbk_template.replace(
|
|
" exon 618..756\n"
|
|
' /gene="FTCD"\n'
|
|
" /number=2\n",
|
|
"",
|
|
)
|
|
assert len(gbk_template) == 4445
|
|
assert gbk_template.count("%") == 1, gbk_template
|
|
|
|
|
|
class SeqIOFeatureTestBaseClass(SeqIOTestBaseClass):
|
|
def compare_feature(self, old, new, msg=None):
|
|
"""Check two SeqFeatures agree."""
|
|
self.assertEqual(old.type, new.type, msg=msg)
|
|
self.assertEqual(old.location.start, new.location.start, msg=msg)
|
|
if old.location.strand is not None:
|
|
self.assertEqual(old.location.strand, new.location.strand, msg=msg)
|
|
self.assertEqual(old.location.ref, new.location.ref, msg=msg)
|
|
self.assertEqual(old.location.ref_db, new.location.ref_db, msg=msg)
|
|
self.assertEqual(
|
|
getattr(old.location, "operator", None),
|
|
getattr(new.location, "operator", None),
|
|
msg=msg,
|
|
)
|
|
self.assertEqual(old.location.start, new.location.start, msg=msg)
|
|
self.assertEqual(str(old.location.start), str(new.location.start), msg=msg)
|
|
self.assertEqual(old.location.end, new.location.end, msg=msg)
|
|
self.assertEqual(str(old.location.end), str(new.location.end), msg=msg)
|
|
# This only checks key shared qualifiers
|
|
# Would a white list be easier?
|
|
# for key in ["name","gene","translation","codon_table","codon_start","locus_tag"]:
|
|
for key in set(old.qualifiers).intersection(new.qualifiers):
|
|
if key in ["db_xref", "protein_id", "product", "note"]:
|
|
# EMBL and GenBank files are use different references/notes/etc
|
|
continue
|
|
err_msg = f"qualifier mismatch for {key}"
|
|
if msg:
|
|
err_msg = f"{msg}; {err_msg}"
|
|
self.assertEqual(old.qualifiers[key], new.qualifiers[key], msg=err_msg)
|
|
|
|
def compare_record(self, old, new, msg=None, expect_minor_diffs=False):
|
|
# Note the name matching is a bit fuzzy
|
|
if not expect_minor_diffs:
|
|
err_msg = "'%s' or '%s' vs '%s' or '%s' records" % (
|
|
old.id,
|
|
old.name,
|
|
new.id,
|
|
new.name,
|
|
)
|
|
if msg:
|
|
err_msg = f"{msg}; {err_msg}"
|
|
self.assertTrue(
|
|
old.id == new.id
|
|
or old.name == new.name
|
|
or old.id in new.id
|
|
or new.id in old.id
|
|
or old.id.replace(" ", "_") == new.id.replace(" ", "_"),
|
|
msg=err_msg,
|
|
)
|
|
self.assertEqual(len(old.seq), len(new.seq), msg=msg)
|
|
if len(old.seq) < 200:
|
|
old_seq = old.seq
|
|
else:
|
|
old_seq = old.seq[:100]
|
|
if len(new.seq) < 200:
|
|
new_seq = new.seq
|
|
else:
|
|
new_seq = new.seq[:100]
|
|
old_seq_upper = old.seq.upper()
|
|
try:
|
|
bytes(old_seq)
|
|
except UndefinedSequenceError:
|
|
old_seq = None
|
|
new_seq_upper = new.seq.upper()
|
|
try:
|
|
bytes(new_seq)
|
|
except UndefinedSequenceError:
|
|
new_seq = None
|
|
err_msg = f"'{old_seq}' vs '{new_seq}'"
|
|
if msg:
|
|
err_msg = f"{msg}; {err_msg}"
|
|
if old_seq is None and new_seq is None:
|
|
self.assertEqual(repr(old_seq_upper), repr(new_seq_upper), msg=err_msg)
|
|
else:
|
|
self.assertEqual(old_seq_upper, new_seq_upper, msg=err_msg)
|
|
if old.features and new.features:
|
|
self.assertEqual(len(old.features), len(new.features), msg=msg)
|
|
for old_feature, new_feature in zip(old.features, new.features):
|
|
# This assumes they are in the same order
|
|
self.compare_feature(old_feature, new_feature, msg=msg)
|
|
|
|
# Just insist on at least one word in common:
|
|
if old.description or new.description:
|
|
words = set(old.description.split()).intersection(new.description.split())
|
|
self.assertGreater(len(words), 0, msg=msg)
|
|
# This only checks common annotation
|
|
# Would a white list be easier?
|
|
for key in set(old.annotations).intersection(new.annotations):
|
|
if key in ["data_file_division", "accessions"]:
|
|
# TODO - These are not yet supported on output, or
|
|
# have other complications (e.g. different number of accessions
|
|
# allowed in various file formats)
|
|
continue
|
|
elif key == "molecule_type":
|
|
# EMBL allows e.g. "genomics DNA" where GenBank is limited.
|
|
common_words = set(old.annotations[key].split())
|
|
common_words = common_words.intersection(new.annotations[key].split())
|
|
self.assertGreater(len(common_words), 0, msg=msg)
|
|
elif key == "comment":
|
|
# Ignore whitespace
|
|
self.assertEqual(
|
|
old.annotations[key].split(), new.annotations[key].split(), msg=msg
|
|
)
|
|
elif key == "references":
|
|
if expect_minor_diffs:
|
|
# TODO - Implement EMBL output of references
|
|
continue
|
|
self.assertEqual(
|
|
len(old.annotations[key]), len(new.annotations[key]), msg=msg
|
|
)
|
|
for r1, r2 in zip(old.annotations[key], new.annotations[key]):
|
|
self.assertEqual(r1.title, r2.title, msg=msg)
|
|
self.assertEqual(r1.authors, r2.authors, msg=msg)
|
|
self.assertEqual(r1.journal, r2.journal, msg=msg)
|
|
if r1.consrtm and r2.consrtm:
|
|
# Not held in EMBL files
|
|
self.assertEqual(r1.consrtm, r2.consrtm, msg=msg)
|
|
if r1.medline_id and r2.medline_id:
|
|
# Not held in EMBL files
|
|
self.assertEqual(r1.medline_id, r2.medline_id, msg=msg)
|
|
self.assertEqual(r1.pubmed_id, r2.pubmed_id, msg=msg)
|
|
elif key == "date" and (
|
|
isinstance(old.annotations[key], datetime.datetime)
|
|
or isinstance(old.annotations[key], datetime.date)
|
|
):
|
|
oa = old.annotations[key]
|
|
months = [
|
|
"JAN",
|
|
"FEB",
|
|
"MAR",
|
|
"APR",
|
|
"MAY",
|
|
"JUN",
|
|
"JUL",
|
|
"AUG",
|
|
"SEP",
|
|
"OCT",
|
|
"NOV",
|
|
"DEC",
|
|
]
|
|
oa = f"{oa.day:02d}-{months[oa.month - 1]}-{oa.year}"
|
|
self.assertEqual(oa, new.annotations[key], msg=msg)
|
|
else:
|
|
self.assertEqual(
|
|
repr(old.annotations[key]), repr(new.annotations[key]), msg=msg
|
|
)
|
|
|
|
|
|
class GenBankLocations(SeqIOFeatureTestBaseClass):
|
|
"""Tests for parsing GenBank feature locations."""
|
|
|
|
def check_loc(self, expected_location_obj, input_location_str, round_trip=True):
|
|
rec = SeqIO.read(StringIO(gbk_template % input_location_str), "gb")
|
|
self.assertEqual(len(rec.features), 2)
|
|
self.assertEqual(rec.features[0].type, "source")
|
|
self.assertEqual(rec.features[1].type, "misc_feature")
|
|
# TODO - Do we have object equality defined?
|
|
self.assertEqual(str(expected_location_obj), str(rec.features[1].location))
|
|
if round_trip:
|
|
self.assertEqual(
|
|
input_location_str, _get_location_string(rec.features[1], 99999)
|
|
)
|
|
|
|
def test_rev_comp_styles(self):
|
|
# These two are equivalent locations
|
|
ncbi_str = "complement(join(84..283,1149..1188))"
|
|
embl_str = "join(complement(1149..1188),complement(84..283))"
|
|
exon1 = SimpleLocation(1148, 1188, strand=-1)
|
|
exon2 = SimpleLocation(83, 283, strand=-1)
|
|
expected_loc = exon1 + exon2
|
|
self.check_loc(expected_loc, ncbi_str)
|
|
self.check_loc(expected_loc, embl_str, round_trip=False)
|
|
# TODO self.assertEqual(ncbi_str, _get_location_string(loc, ...))
|
|
|
|
def test_mixed_strand(self):
|
|
# Trans-spliced example from NC_000932
|
|
loc_str = "join(complement(69611..69724),139856..140087,140625..140650)"
|
|
exon1 = SimpleLocation(69610, 69724, strand=-1)
|
|
exon2 = SimpleLocation(139855, 140087, strand=+1)
|
|
exon3 = SimpleLocation(140624, 140650, strand=+1)
|
|
expected_loc = exon1 + exon2 + exon3
|
|
self.check_loc(expected_loc, loc_str)
|
|
# Made up example putting reverse-complement exon in middle,
|
|
loc_str = "join(69611..69724,complement(139856..140087),140625..140650)"
|
|
exon1 = SimpleLocation(69610, 69724, strand=+1)
|
|
exon2 = SimpleLocation(139855, 140087, strand=-1)
|
|
exon3 = SimpleLocation(140624, 140650, strand=+1)
|
|
expected_loc = exon1 + exon2 + exon3
|
|
self.check_loc(expected_loc, loc_str)
|
|
# Made up example putting reverse-complement exon at end,
|
|
loc_str = "join(69611..69724,139856..140087,complement(140625..140650))"
|
|
exon1 = SimpleLocation(69610, 69724, strand=+1)
|
|
exon2 = SimpleLocation(139855, 140087, strand=+1)
|
|
exon3 = SimpleLocation(140624, 140650, strand=-1)
|
|
expected_loc = exon1 + exon2 + exon3
|
|
self.check_loc(expected_loc, loc_str)
|
|
# Another made up example
|
|
loc_str = (
|
|
"join(complement(69611..69724),139856..140087,complement(140625..140650))"
|
|
)
|
|
exon1 = SimpleLocation(69610, 69724, strand=-1)
|
|
exon2 = SimpleLocation(139855, 140087, strand=+1)
|
|
exon3 = SimpleLocation(140624, 140650, strand=-1)
|
|
expected_loc = exon1 + exon2 + exon3
|
|
self.check_loc(expected_loc, loc_str)
|
|
|
|
|
|
class SeqFeatureExtractionWritingReading(SeqIOFeatureTestBaseClass):
|
|
"""Tests for SeqFeature sequence extract method, writing, and reading."""
|
|
|
|
def check(self, parent_seq, feature, answer_str, location_str):
|
|
self.assertEqual(location_str, _get_location_string(feature, len(parent_seq)))
|
|
|
|
new = feature.extract(parent_seq)
|
|
self.assertIsInstance(new, Seq)
|
|
self.assertEqual(new, answer_str)
|
|
|
|
new = feature.extract(str(parent_seq))
|
|
self.assertIsInstance(new, str)
|
|
self.assertEqual(new, answer_str)
|
|
|
|
new = feature.extract(MutableSeq(parent_seq))
|
|
self.assertIsInstance(new, Seq) # Not MutableSeq!
|
|
self.assertEqual(new, answer_str)
|
|
|
|
new = feature.extract(Seq(None, len(parent_seq)))
|
|
self.assertIsInstance(new, Seq)
|
|
self.assertEqual(len(new), len(answer_str))
|
|
if len(answer_str) > 0:
|
|
self.assertRaises(UndefinedSequenceError, str, new)
|
|
|
|
if _get_location_string(feature, 1326) != location_str:
|
|
# This is to avoid issues with the N^1 between feature which only
|
|
# makes sense at the end of the sequence
|
|
return
|
|
# This template is DNA, but that will still be OK for protein features
|
|
# as they have no strand information... but see below for strand fun
|
|
rec = SeqIO.read(StringIO(gbk_template % location_str), "gb")
|
|
self.assertEqual(1326, len(rec))
|
|
self.assertEqual(2, len(rec.features))
|
|
self.assertEqual(rec.features[0].type, "source")
|
|
self.assertEqual(rec.features[1].type, "misc_feature")
|
|
new_f = rec.features[1]
|
|
self.assertEqual(location_str, _get_location_string(new_f, 1326))
|
|
|
|
feature.type = "misc_feature" # hack as may not be misc_feature
|
|
if not feature.location.strand:
|
|
feature.location.strand = new_f.location.strand # hack as above
|
|
self.compare_feature(feature, new_f)
|
|
|
|
# Some feature method tests
|
|
parent = "ACGT" * 250
|
|
s = feature.extract(parent)
|
|
self.assertEqual(len(feature), len(s))
|
|
for i in feature:
|
|
self.assertIn(i, feature)
|
|
self.assertEqual(set(feature), {i for i in range(1000) if i in feature})
|
|
if feature.location.strand == +1:
|
|
self.assertEqual(s, "".join(parent[i] for i in feature))
|
|
if len(feature):
|
|
self.assertEqual(feature.location.start, min(feature.location))
|
|
self.assertEqual(feature.location.end, max(feature.location) + 1)
|
|
self.assertLessEqual(
|
|
len(feature), feature.location.end - feature.location.start
|
|
)
|
|
|
|
def test_simple_rna(self):
|
|
"""Feature on RNA (simple, default strand)."""
|
|
s = Seq("GAUCRYWSMKHBVDN")
|
|
f = SeqFeature(SimpleLocation(5, 10))
|
|
self.assertIsNone(f.location.strand)
|
|
self.check(s, f, "YWSMK", "6..10")
|
|
|
|
def test_simple_dna(self):
|
|
"""Feature on DNA (simple, default strand)."""
|
|
s = Seq("GATCRYWSMKHBVDN")
|
|
f = SeqFeature(SimpleLocation(5, 10))
|
|
self.check(s, f, "YWSMK", "6..10")
|
|
|
|
def test_single_letter_dna(self):
|
|
"""Feature on DNA (single letter, default strand)."""
|
|
s = Seq("GATCRYWSMKHBVDN")
|
|
f = SeqFeature(SimpleLocation(5, 6))
|
|
self.check(s, f, "Y", "6")
|
|
|
|
def test_zero_len_dna(self):
|
|
"""Feature on DNA (between location, zero length, default strand)."""
|
|
s = Seq("GATCRYWSMKHBVDN")
|
|
f = SeqFeature(SimpleLocation(5, 5))
|
|
self.check(s, f, "", "5^6")
|
|
|
|
def test_zero_len_dna_end(self):
|
|
"""Feature on DNA (between location at end, zero length, default strand)."""
|
|
s = Seq("GATCRYWSMKHBVDN")
|
|
f = SeqFeature(SimpleLocation(15, 15))
|
|
self.check(s, f, "", "15^1")
|
|
|
|
def test_simple_dna_strand0(self):
|
|
"""Feature on DNA (simple, strand 0)."""
|
|
s = Seq("GATCRYWSMKHBVDN")
|
|
f = SeqFeature(SimpleLocation(5, 10, strand=0))
|
|
self.check(s, f, "YWSMK", "6..10")
|
|
|
|
def test_simple_dna_strand_none(self):
|
|
"""Feature on DNA (simple, strand None)."""
|
|
s = Seq("GATCRYWSMKHBVDN")
|
|
f = SeqFeature(SimpleLocation(5, 10, strand=None))
|
|
self.check(s, f, "YWSMK", "6..10")
|
|
|
|
def test_simple_dna_strand1(self):
|
|
"""Feature on DNA (simple, strand +1)."""
|
|
s = Seq("GATCRYWSMKHBVDN")
|
|
f = SeqFeature(SimpleLocation(5, 10, strand=1))
|
|
self.assertEqual(f.location.strand, +1)
|
|
self.check(s, f, "YWSMK", "6..10")
|
|
|
|
def test_simple_dna_strand_minus(self):
|
|
"""Feature on DNA (simple, strand -1)."""
|
|
s = Seq("GATCRYWSMKHBVDN")
|
|
f = SeqFeature(SimpleLocation(5, 10, strand=-1))
|
|
self.assertEqual(f.location.strand, -1)
|
|
self.check(s, f, "MKSWR", "complement(6..10)")
|
|
|
|
def test_simple_dna_join(self):
|
|
"""Feature on DNA (join, strand +1)."""
|
|
s = Seq("GATCRYWSMKHBVDN")
|
|
f1 = SeqFeature(SimpleLocation(5, 10, strand=1))
|
|
f2 = SeqFeature(SimpleLocation(12, 15, strand=1))
|
|
f = make_join_feature([f1, f2])
|
|
self.check(s, f, "YWSMKVDN", "join(6..10,13..15)")
|
|
|
|
def test_simple_dna_join_strand_minus(self):
|
|
"""Feature on DNA (join, strand -1)."""
|
|
s = Seq("AAAAACCCCCTTTTTGGGGG")
|
|
f1 = SeqFeature(SimpleLocation(5, 10, strand=-1))
|
|
f2 = SeqFeature(SimpleLocation(12, 15, strand=-1))
|
|
f = make_join_feature([f1, f2])
|
|
self.check(
|
|
s, f, reverse_complement("CCCCCTTT"), "complement(join(6..10,13..15))"
|
|
)
|
|
|
|
def test_simple_dna_join_before(self):
|
|
"""Feature on DNA (join, strand -1, before position)."""
|
|
s = Seq("AAAAACCCCCTTTTTGGGGG")
|
|
f1 = SeqFeature(SimpleLocation(BeforePosition(5), 10, strand=-1))
|
|
f2 = SeqFeature(SimpleLocation(12, 15, strand=-1))
|
|
f = make_join_feature([f1, f2])
|
|
self.check(
|
|
s, f, reverse_complement("CCCCCTTT"), "complement(join(<6..10,13..15))"
|
|
)
|
|
|
|
def test_simple_dna_join_after(self):
|
|
"""Feature on DNA (join, strand -1, after position)."""
|
|
s = Seq("AAAAACCCCCTTTTTGGGGG")
|
|
f1 = SeqFeature(SimpleLocation(5, 10, strand=-1))
|
|
f2 = SeqFeature(SimpleLocation(12, AfterPosition(15), strand=-1))
|
|
f = make_join_feature([f1, f2])
|
|
self.check(
|
|
s, f, reverse_complement("CCCCCTTT"), "complement(join(6..10,13..>15))"
|
|
)
|
|
|
|
def test_mixed_strand_dna_join(self):
|
|
"""Feature on DNA (join, mixed strand)."""
|
|
s = Seq("AAAAACCCCCTTTTTGGGGG")
|
|
f1 = SeqFeature(SimpleLocation(5, 10, strand=+1))
|
|
f2 = SeqFeature(SimpleLocation(12, 15, strand=-1))
|
|
f = make_join_feature([f1, f2])
|
|
self.check(
|
|
s, f, "CCCCC" + reverse_complement("TTT"), "join(6..10,complement(13..15))"
|
|
)
|
|
|
|
def test_mixed_strand_dna_multi_join(self):
|
|
"""Feature on DNA (multi-join, mixed strand)."""
|
|
s = Seq("AAAAACCCCCTTTTTGGGGG")
|
|
f1 = SeqFeature(SimpleLocation(5, 10, strand=+1))
|
|
f2 = SeqFeature(SimpleLocation(12, 15, strand=-1))
|
|
f3 = SeqFeature(SimpleLocation(BeforePosition(0), 5, strand=+1))
|
|
f = make_join_feature([f1, f2, f3])
|
|
self.check(
|
|
s,
|
|
f,
|
|
"CCCCC" + reverse_complement("TTT") + "AAAAA",
|
|
"join(6..10,complement(13..15),<1..5)",
|
|
)
|
|
|
|
def test_protein_simple(self):
|
|
"""Feature on protein (simple)."""
|
|
s = Seq("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
|
f = SeqFeature(SimpleLocation(5, 10))
|
|
self.check(s, f, "FGHIJ", "6..10")
|
|
|
|
def test_protein_join(self):
|
|
"""Feature on protein (join)."""
|
|
s = Seq("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
|
f1 = SeqFeature(SimpleLocation(5, 10))
|
|
f2 = SeqFeature(SimpleLocation(15, 20))
|
|
f = make_join_feature([f1, f2])
|
|
self.check(s, f, "FGHIJPQRST", "join(6..10,16..20)")
|
|
|
|
def test_protein_join_fuzzy(self):
|
|
"""Feature on protein (fuzzy join)."""
|
|
s = Seq("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
|
f1 = SeqFeature(SimpleLocation(BeforePosition(5), 10))
|
|
f2 = SeqFeature(
|
|
SimpleLocation(
|
|
OneOfPosition(15, (ExactPosition(15), ExactPosition(16))),
|
|
AfterPosition(20),
|
|
)
|
|
)
|
|
f = make_join_feature([f1, f2])
|
|
self.check(s, f, "FGHIJPQRST", "join(<6..10,one-of(16,17)..>20)")
|
|
|
|
def test_protein_multi_join(self):
|
|
"""Feature on protein (multi-join)."""
|
|
s = Seq("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
|
f1 = SeqFeature(SimpleLocation(1, 2))
|
|
f2 = SeqFeature(SimpleLocation(8, 9))
|
|
f3 = SeqFeature(SimpleLocation(14, 16))
|
|
f4 = SeqFeature(SimpleLocation(24, 25))
|
|
f5 = SeqFeature(SimpleLocation(19, 20))
|
|
f6 = SeqFeature(SimpleLocation(7, 8))
|
|
f7 = SeqFeature(SimpleLocation(14, 15))
|
|
f8 = SeqFeature(SimpleLocation(13, 14))
|
|
f = make_join_feature([f1, f2, f3, f4, f5, f6, f7, f8])
|
|
self.check(s, f, "BIOPYTHON", "join(2,9,15..16,25,20,8,15,14)")
|
|
|
|
def test_protein_between(self):
|
|
"""Feature on protein (between location, zero length)."""
|
|
s = Seq("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
|
f = SeqFeature(SimpleLocation(5, 5))
|
|
self.check(s, f, "", "5^6")
|
|
|
|
def test_protein_oneof(self):
|
|
"""Feature on protein (one-of positions)."""
|
|
s = Seq("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
|
start = OneOfPosition(5, (ExactPosition(5), ExactPosition(7)))
|
|
end = OneOfPosition(11, (ExactPosition(10), ExactPosition(11)))
|
|
f = SeqFeature(SimpleLocation(start, 10))
|
|
self.check(s, f, "FGHIJ", "one-of(6,8)..10")
|
|
f = SeqFeature(SimpleLocation(start, end))
|
|
self.check(s, f, "FGHIJK", "one-of(6,8)..one-of(10,11)")
|
|
f = SeqFeature(SimpleLocation(5, end))
|
|
self.check(s, f, "FGHIJK", "6..one-of(10,11)")
|
|
|
|
|
|
class SeqFeatureCreation(unittest.TestCase):
|
|
"""Test basic creation of SeqFeatures."""
|
|
|
|
def test_qualifiers(self):
|
|
"""Pass in qualifiers to SeqFeatures."""
|
|
f = SeqFeature(SimpleLocation(10, 20, strand=+1), type="CDS")
|
|
self.assertEqual(f.qualifiers, {})
|
|
f = SeqFeature(
|
|
SimpleLocation(10, 20, strand=+1),
|
|
type="CDS",
|
|
qualifiers={"test": ["a test"]},
|
|
)
|
|
self.assertEqual(f.qualifiers["test"], ["a test"])
|
|
|
|
|
|
class FeatureWriting(SeqIOFeatureTestBaseClass):
|
|
def setUp(self):
|
|
self.record = SeqRecord(
|
|
Seq("ACGT" * 100), id="Test", name="Test", description="Test"
|
|
)
|
|
self.record.annotations["molecule_type"] = "DNA"
|
|
|
|
def write_read_check(self, check_format):
|
|
handle = StringIO()
|
|
SeqIO.write([self.record], handle, check_format)
|
|
handle.seek(0)
|
|
record2 = SeqIO.read(handle, check_format)
|
|
self.compare_record(self.record, record2)
|
|
|
|
def write_read_checks(self, formats=("gb", "embl", "imgt")):
|
|
for f in formats:
|
|
self.write_read_check(f)
|
|
|
|
def test_exact(self):
|
|
"""GenBank/EMBL write/read simple exact locations."""
|
|
# Note we don't have to explicitly give an ExactPosition object,
|
|
# an integer will also work:
|
|
f = SeqFeature(SimpleLocation(10, 20, strand=+1), type="CDS")
|
|
self.assertEqual(_get_location_string(f, 100), "11..20")
|
|
self.assertEqual(_get_location_string(f._flip(20), 20), "complement(1..10)")
|
|
self.assertEqual(_get_location_string(f._flip(100), 100), "complement(81..90)")
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(SimpleLocation(30, 40, strand=-1), type="CDS")
|
|
self.assertEqual(_get_location_string(f, 100), "complement(31..40)")
|
|
self.assertEqual(_get_location_string(f._flip(40), 40), "1..10")
|
|
self.assertEqual(_get_location_string(f._flip(100), 100), "61..70")
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(
|
|
SimpleLocation(ExactPosition(50), ExactPosition(60), strand=+1),
|
|
type="CDS",
|
|
)
|
|
self.assertEqual(_get_location_string(f, 100), "51..60")
|
|
self.assertEqual(_get_location_string(f._flip(60), 60), "complement(1..10)")
|
|
self.assertEqual(_get_location_string(f._flip(100), 100), "complement(41..50)")
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
self.write_read_checks()
|
|
# The write/check won't work on strandless features due to the
|
|
# limitations of the GenBank (and EMBL) feature location scheme
|
|
for s in [0, None]:
|
|
# Check flipping of a simple strand 0 feature:
|
|
f = SeqFeature(SimpleLocation(0, 100, strand=s), type="source")
|
|
self.assertEqual(_get_location_string(f, 100), "1..100")
|
|
self.assertEqual(_get_location_string(f._flip(100), 100), "1..100")
|
|
self.assertEqual(_get_location_string(f._flip(200), 200), "101..200")
|
|
self.assertEqual(f._flip(100).location.strand, f.location.strand)
|
|
|
|
# Test for compound locations (see https://github.com/biopython/biopython/issues/4611)
|
|
|
|
# flip with +1/-1 strands does not invert the order in compound locations
|
|
loc = SimpleLocation(4, 6, 1) + SimpleLocation(0, 1, 1)
|
|
self.assertEqual(str(loc._flip(6)), "join{[0:2](-), [5:6](-)}")
|
|
loc = SimpleLocation(4, 6, -1) + SimpleLocation(0, 1, -1)
|
|
self.assertEqual(str(loc._flip(6)), "join{[0:2](+), [5:6](+)}")
|
|
|
|
# flip with None strand inverts the order in compound locations
|
|
loc = SimpleLocation(4, 6, None) + SimpleLocation(0, 1, None)
|
|
self.assertEqual(str(loc._flip(6)), "join{[5:6], [0:2]}")
|
|
|
|
def test_between(self):
|
|
"""GenBank/EMBL write/read simple between locations."""
|
|
# Note we don't use the BetweenPosition any more!
|
|
f = SeqFeature(SimpleLocation(10, 10, strand=+1), type="variation")
|
|
self.assertEqual(_get_location_string(f, 100), "10^11")
|
|
self.assertEqual(_get_location_string(f._flip(20), 20), "complement(10^11)")
|
|
self.assertEqual(_get_location_string(f._flip(100), 100), "complement(90^91)")
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
f = SeqFeature(SimpleLocation(20, 20, strand=-1), type="variation")
|
|
self.assertEqual(_get_location_string(f, 100), "complement(20^21)")
|
|
self.assertEqual(_get_location_string(f._flip(40), 40), "20^21")
|
|
self.assertEqual(_get_location_string(f._flip(100), 100), "80^81")
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
self.record.features.append(f)
|
|
self.write_read_checks()
|
|
|
|
def test_unknown(self):
|
|
"""GenBank/EMBL write/read with unknown end points."""
|
|
f = SeqFeature(SimpleLocation(10, 15, strand=+1), type="region")
|
|
self.assertEqual(_get_location_string(f, 100), "11..15")
|
|
self.record.features.append(f)
|
|
f = SeqFeature(SimpleLocation(10, UnknownPosition(), strand=+1), type="region")
|
|
self.assertEqual(_get_location_string(f, 100), "11..>11")
|
|
self.record.features.append(f)
|
|
f = SeqFeature(SimpleLocation(UnknownPosition(), 15, strand=+1), type="region")
|
|
self.assertEqual(_get_location_string(f, 100), "<15..15")
|
|
self.record.features.append(f)
|
|
f = SeqFeature(SimpleLocation(10, 15, strand=-1), type="region")
|
|
self.assertEqual(_get_location_string(f, 100), "complement(11..15)")
|
|
f = SeqFeature(SimpleLocation(10, UnknownPosition(), strand=-1), type="region")
|
|
self.assertEqual(_get_location_string(f, 100), "complement(11..>11)")
|
|
self.record.features.append(f)
|
|
f = SeqFeature(SimpleLocation(UnknownPosition(), 15, strand=-1), type="region")
|
|
self.assertEqual(_get_location_string(f, 100), "complement(<15..15)")
|
|
self.record.features.append(f)
|
|
# This doesn't round trip
|
|
# self.write_read_checks()
|
|
|
|
def test_join(self):
|
|
"""GenBank/EMBL write/read simple join locations."""
|
|
f1 = SeqFeature(SimpleLocation(10, 20, strand=+1))
|
|
f2 = SeqFeature(SimpleLocation(25, 40, strand=+1))
|
|
f = make_join_feature([f1, f2])
|
|
self.record.features.append(f)
|
|
self.assertEqual(_get_location_string(f, 500), "join(11..20,26..40)")
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(60), 60), "complement(join(21..35,41..50))"
|
|
)
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(100), 100), "complement(join(61..75,81..90))"
|
|
)
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
for sub_loc in f._flip(100).location.parts:
|
|
self.assertEqual(sub_loc.strand, -1)
|
|
f1 = SeqFeature(SimpleLocation(110, 120, strand=+1))
|
|
f2 = SeqFeature(SimpleLocation(125, 140, strand=+1))
|
|
f3 = SeqFeature(SimpleLocation(145, 150, strand=+1))
|
|
f = make_join_feature([f1, f2, f3], "CDS")
|
|
self.assertEqual(
|
|
_get_location_string(f, 500), "join(111..120,126..140,146..150)"
|
|
)
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(150), 150),
|
|
"complement(join(1..5,11..25,31..40))",
|
|
)
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
for sub_loc in f._flip(100).location.parts:
|
|
self.assertEqual(sub_loc.strand, -1)
|
|
self.record.features.append(f)
|
|
f1 = SeqFeature(SimpleLocation(210, 220, strand=-1))
|
|
f2 = SeqFeature(SimpleLocation(225, 240, strand=-1))
|
|
f = make_join_feature([f1, f2], ftype="gene")
|
|
self.assertEqual(
|
|
_get_location_string(f, 500), "complement(join(211..220,226..240))"
|
|
)
|
|
self.assertEqual(_get_location_string(f._flip(300), 300), "join(61..75,81..90)")
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
for sub_loc in f._flip(100).location.parts:
|
|
self.assertEqual(sub_loc.strand, +1)
|
|
self.record.features.append(f)
|
|
f1 = SeqFeature(SimpleLocation(310, 320, strand=-1))
|
|
f2 = SeqFeature(SimpleLocation(325, 340, strand=-1))
|
|
f3 = SeqFeature(SimpleLocation(345, 350, strand=-1))
|
|
f = make_join_feature([f1, f2, f3], "CDS")
|
|
self.assertEqual(
|
|
_get_location_string(f, 500), "complement(join(311..320,326..340,346..350))"
|
|
)
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(350), 350), "join(1..5,11..25,31..40)"
|
|
)
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
for sub_loc in f._flip(100).location.parts:
|
|
self.assertEqual(sub_loc.strand, +1)
|
|
self.record.features.append(f)
|
|
self.write_read_checks()
|
|
|
|
def test_fuzzy_join(self):
|
|
"""Features: write/read fuzzy join locations."""
|
|
s = "N" * 500
|
|
f1 = SeqFeature(SimpleLocation(BeforePosition(10), 20, strand=+1))
|
|
f2 = SeqFeature(SimpleLocation(25, AfterPosition(40), strand=+1))
|
|
f = make_join_feature([f1, f2])
|
|
self.record.features.append(f)
|
|
self.assertEqual(_get_location_string(f, 500), "join(<11..20,26..>40)")
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(100), 100), "complement(join(<61..75,81..>90))"
|
|
)
|
|
self.assertEqual(f.location.strand, +1)
|
|
for sub_loc in f.location.parts:
|
|
self.assertEqual(sub_loc.strand, +1)
|
|
tmp = f._flip(100)
|
|
self.assertEqual(tmp.location.strand, -1)
|
|
for sub_loc in tmp.location.parts:
|
|
self.assertEqual(sub_loc.strand, -1)
|
|
|
|
f1 = SeqFeature(
|
|
SimpleLocation(
|
|
OneOfPosition(107, [ExactPosition(107), ExactPosition(110)]),
|
|
120,
|
|
strand=+1,
|
|
),
|
|
)
|
|
f2 = SeqFeature(SimpleLocation(125, 140, strand=+1))
|
|
f3 = SeqFeature(
|
|
SimpleLocation(145, WithinPosition(160, left=150, right=160), strand=+1)
|
|
)
|
|
f = make_join_feature([f1, f2, f3], "CDS")
|
|
self.assertEqual(
|
|
_get_location_string(f, 500),
|
|
"join(one-of(108,111)..120,126..140,146..(150.160))",
|
|
)
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(200), 200),
|
|
"complement(join((41.51)..55,61..75,81..one-of(90,93)))",
|
|
)
|
|
self.assertEqual(f.location.strand, +1)
|
|
for sub_loc in f.location.parts:
|
|
self.assertEqual(sub_loc.strand, +1)
|
|
tmp = f._flip(100)
|
|
self.assertEqual(tmp.location.strand, -1)
|
|
for sub_loc in tmp.location.parts:
|
|
self.assertEqual(sub_loc.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
f1 = SeqFeature(SimpleLocation(BeforePosition(210), 220, strand=-1))
|
|
f2 = SeqFeature(
|
|
SimpleLocation(225, WithinPosition(244, left=240, right=244), strand=-1)
|
|
)
|
|
f = make_join_feature([f1, f2], "gene")
|
|
self.assertEqual(
|
|
_get_location_string(f, 500), "complement(join(<211..220,226..(240.244)))"
|
|
)
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(300), 300), "join((57.61)..75,81..>90)"
|
|
)
|
|
self.assertEqual(f.location.strand, -1)
|
|
for sub_loc in f.location.parts:
|
|
self.assertEqual(sub_loc.strand, -1)
|
|
tmp = f._flip(100)
|
|
self.assertEqual(tmp.location.strand, +1)
|
|
for sub_loc in tmp.location.parts:
|
|
self.assertEqual(sub_loc.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
f1 = SeqFeature(SimpleLocation(AfterPosition(310), 320, strand=-1))
|
|
# Note - is one-of(340,337) allowed or should it be one-of(337,340)?
|
|
pos = OneOfPosition(340, [ExactPosition(340), ExactPosition(337)])
|
|
f2 = SeqFeature(SimpleLocation(325, pos, strand=-1))
|
|
f3 = SeqFeature(
|
|
SimpleLocation(345, WithinPosition(355, left=350, right=355), strand=-1)
|
|
)
|
|
f = make_join_feature([f1, f2, f3], "CDS")
|
|
self.assertEqual(
|
|
_get_location_string(f, 500),
|
|
"complement(join(>311..320,326..one-of(340,337),346..(350.355)))",
|
|
)
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(400), 400),
|
|
"join((46.51)..55,one-of(64,61)..75,81..<90)",
|
|
)
|
|
self.assertEqual(f.location.strand, -1)
|
|
tmp = f._flip(100)
|
|
self.assertEqual(tmp.location.strand, +1)
|
|
for sub_loc in tmp.location.parts:
|
|
self.assertEqual(sub_loc.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
self.write_read_checks()
|
|
|
|
def test_before(self):
|
|
"""Features: write/read simple before locations."""
|
|
s = "N" * 200
|
|
f = SeqFeature(SimpleLocation(BeforePosition(5), 10, strand=+1), type="CDS")
|
|
self.assertEqual(_get_location_string(f, 100), "<6..10")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(20), 20), "complement(11..>15)")
|
|
self.assertEqual(f.location.strand, +1)
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(
|
|
SimpleLocation(BeforePosition(15), BeforePosition(20), strand=+1),
|
|
type="CDS",
|
|
)
|
|
self.assertEqual(_get_location_string(f, 100), "<16..<20")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(20), 20), "complement(>1..>5)")
|
|
self.assertEqual(f.location.strand, +1)
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(SimpleLocation(25, BeforePosition(30), strand=+1), type="CDS")
|
|
self.assertEqual(_get_location_string(f, 100), "26..<30")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(40), 40), "complement(>11..15)")
|
|
self.assertEqual(f.location.strand, +1)
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(SimpleLocation(BeforePosition(35), 40, strand=-1), type="CDS")
|
|
self.assertEqual(_get_location_string(f, 100), "complement(<36..40)")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(40), 40), "1..>5")
|
|
self.assertEqual(f.location.strand, -1)
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(
|
|
SimpleLocation(BeforePosition(45), BeforePosition(50), strand=-1),
|
|
type="CDS",
|
|
)
|
|
self.assertEqual(_get_location_string(f, 100), "complement(<46..<50)")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(100), 100), ">51..>55")
|
|
self.assertEqual(f.location.strand, -1)
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(SimpleLocation(55, BeforePosition(60), strand=-1), type="CDS")
|
|
self.assertEqual(_get_location_string(f, 100), "complement(56..<60)")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(100), 100), ">41..45")
|
|
self.assertEqual(f.location.strand, -1)
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
self.write_read_checks()
|
|
|
|
def test_after(self):
|
|
"""Features: write/read simple after locations."""
|
|
s = "N" * 200
|
|
f = SeqFeature(SimpleLocation(AfterPosition(5), 10, strand=+1), type="CDS")
|
|
self.assertEqual(_get_location_string(f, 100), ">6..10")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(100), 100), "complement(91..<95)")
|
|
self.assertEqual(f.location.strand, +1)
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(
|
|
SimpleLocation(AfterPosition(15), AfterPosition(20), strand=+1),
|
|
type="CDS",
|
|
)
|
|
self.assertEqual(_get_location_string(f, 100), ">16..>20")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(20), 20), "complement(<1..<5)")
|
|
self.assertEqual(f.location.strand, +1)
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(SimpleLocation(25, AfterPosition(30), strand=+1), type="CDS")
|
|
self.assertEqual(_get_location_string(f, 100), "26..>30")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(30), 30), "complement(<1..5)")
|
|
self.assertEqual(f.location.strand, +1)
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(SimpleLocation(AfterPosition(35), 40, strand=-1), type="CDS")
|
|
self.assertEqual(_get_location_string(f, 100), "complement(>36..40)")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(100), 100), "61..<65")
|
|
self.assertEqual(f.location.strand, -1)
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(
|
|
SimpleLocation(AfterPosition(45), AfterPosition(50), strand=-1),
|
|
type="CDS",
|
|
)
|
|
self.assertEqual(_get_location_string(f, 100), "complement(>46..>50)")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(100), 100), "<51..<55")
|
|
self.assertEqual(f.location.strand, -1)
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(SimpleLocation(55, AfterPosition(60), strand=-1), type="CDS")
|
|
self.assertEqual(_get_location_string(f, 100), "complement(56..>60)")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(100), 100), "<41..45")
|
|
self.assertEqual(f.location.strand, -1)
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
self.write_read_checks()
|
|
|
|
def test_oneof(self):
|
|
"""Features: write/read simple one-of locations."""
|
|
s = "N" * 100
|
|
start = OneOfPosition(0, [ExactPosition(0), ExactPosition(3), ExactPosition(6)])
|
|
f = SeqFeature(SimpleLocation(start, 21, strand=+1), type="CDS")
|
|
self.assertEqual(_get_location_string(f, 100), "one-of(1,4,7)..21")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(100), 100), "complement(80..one-of(94,97,100))"
|
|
)
|
|
self.assertEqual(f.location.strand, +1)
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
start = OneOfPosition(10, [ExactPosition(x) for x in [10, 13, 16]])
|
|
end = OneOfPosition(50, [ExactPosition(x) for x in [41, 44, 50]])
|
|
f = SeqFeature(SimpleLocation(start, end, strand=+1), type="gene")
|
|
self.assertEqual(
|
|
_get_location_string(f, 100), "one-of(11,14,17)..one-of(41,44,50)"
|
|
)
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(50), 50),
|
|
"complement(one-of(1,7,10)..one-of(34,37,40))",
|
|
)
|
|
self.assertEqual(f.location.strand, +1)
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
end = OneOfPosition(33, [ExactPosition(x) for x in [30, 33]])
|
|
f = SeqFeature(SimpleLocation(27, end, strand=+1), type="gene")
|
|
self.assertEqual(_get_location_string(f, 100), "28..one-of(30,33)")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(40), 40), "complement(one-of(8,11)..13)"
|
|
)
|
|
self.assertEqual(f.location.strand, +1)
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
start = OneOfPosition(36, [ExactPosition(x) for x in [36, 40]])
|
|
f = SeqFeature(SimpleLocation(start, 46, strand=-1), type="CDS")
|
|
self.assertEqual(_get_location_string(f, 100), "complement(one-of(37,41)..46)")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(50), 50), "5..one-of(10,14)")
|
|
self.assertEqual(f.location.strand, -1)
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
start = OneOfPosition(45, [ExactPosition(x) for x in [45, 60]])
|
|
end = OneOfPosition(90, [ExactPosition(x) for x in [70, 90]])
|
|
f = SeqFeature(SimpleLocation(start, end, strand=-1), type="CDS")
|
|
self.assertEqual(
|
|
_get_location_string(f, 100), "complement(one-of(46,61)..one-of(70,90))"
|
|
)
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(100), 100), "one-of(11,31)..one-of(40,55)"
|
|
)
|
|
self.assertEqual(f.location.strand, -1)
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
end = OneOfPosition(63, [ExactPosition(x) for x in [60, 63]])
|
|
f = SeqFeature(SimpleLocation(55, end, strand=-1), type="tRNA")
|
|
self.assertEqual(_get_location_string(f, 100), "complement(56..one-of(60,63))")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(100), 100), "one-of(38,41)..45")
|
|
self.assertEqual(f.location.strand, -1)
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
self.write_read_checks()
|
|
|
|
def test_within(self):
|
|
"""Features: write/read simple within locations."""
|
|
s = "N" * 100
|
|
f = SeqFeature(
|
|
SimpleLocation(WithinPosition(2, left=2, right=8), 10, strand=+1),
|
|
type="CDS",
|
|
)
|
|
self.assertEqual(_get_location_string(f, 100), "(3.9)..10")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(20), 20), "complement(11..(12.18))"
|
|
)
|
|
self.assertEqual(f.location.strand, +1)
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(
|
|
SimpleLocation(
|
|
WithinPosition(12, left=12, right=18),
|
|
WithinPosition(28, left=20, right=28),
|
|
strand=+1,
|
|
),
|
|
type="CDS",
|
|
)
|
|
self.assertEqual(_get_location_string(f, 100), "(13.19)..(20.28)")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(30), 30), "complement((3.11)..(12.18))"
|
|
)
|
|
self.assertEqual(f.location.strand, +1)
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(
|
|
SimpleLocation(25, WithinPosition(33, left=30, right=33), strand=+1),
|
|
type="misc_feature",
|
|
)
|
|
self.assertEqual(_get_location_string(f, 100), "26..(30.33)")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(
|
|
_get_location_string(f._flip(40), 40), "complement((8.11)..15)"
|
|
)
|
|
self.assertEqual(f.location.strand, +1)
|
|
self.assertEqual(f._flip(100).location.strand, -1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(
|
|
SimpleLocation(WithinPosition(35, left=35, right=39), 40, strand=-1),
|
|
type="rRNA",
|
|
)
|
|
self.assertEqual(_get_location_string(f, 100), "complement((36.40)..40)")
|
|
self.assertEqual(_get_location_string(f._flip(40), 40), "1..(1.5)")
|
|
self.assertEqual(f.location.strand, -1)
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(
|
|
SimpleLocation(
|
|
WithinPosition(45, left=45, right=47),
|
|
WithinPosition(53, left=50, right=53),
|
|
strand=-1,
|
|
),
|
|
type="repeat_region",
|
|
)
|
|
self.assertEqual(_get_location_string(f, 100), "complement((46.48)..(50.53))")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(60), 60), "(8.11)..(13.15)")
|
|
self.assertEqual(f.location.strand, -1)
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
f = SeqFeature(
|
|
SimpleLocation(55, WithinPosition(65, left=60, right=65), strand=-1),
|
|
type="CDS",
|
|
)
|
|
self.assertEqual(_get_location_string(f, 100), "complement(56..(60.65))")
|
|
self.assertEqual(len(f), len(f.extract(s)))
|
|
self.assertEqual(_get_location_string(f._flip(100), 100), "(36.41)..45")
|
|
self.assertEqual(f.location.strand, -1)
|
|
self.assertEqual(f._flip(100).location.strand, +1)
|
|
self.record.features.append(f)
|
|
|
|
self.write_read_checks()
|
|
|
|
def test_datetime(self):
|
|
default_locale = locale.setlocale(locale.LC_ALL)
|
|
for locale_name in [
|
|
"C",
|
|
"de_DE.utf8",
|
|
"fr_FR.utf8",
|
|
"pt_BR.utf8",
|
|
]:
|
|
try:
|
|
locale.setlocale(locale.LC_ALL, locale_name)
|
|
# Why october ? Because it is abbreviated to OKT in DE, OCT. in FR, OUT in BR
|
|
self.record.annotations["date"] = datetime.datetime.strptime(
|
|
"2025-10-01", "%Y-%m-%d"
|
|
)
|
|
self.write_read_check("gb")
|
|
except locale.Error:
|
|
pass
|
|
locale.setlocale(locale.LC_ALL, default_locale)
|
|
|
|
def test_date(self):
|
|
for m in [1, 10, 12]:
|
|
self.record.annotations["date"] = datetime.datetime.strptime(
|
|
f"2025-{m}-01", "%Y-%m-%d"
|
|
).date()
|
|
self.write_read_check("gb")
|
|
|
|
def test_invalid_date_format(self):
|
|
# Using a date in the wrong format
|
|
self.record.annotations["date"] = "04-04-1970"
|
|
stream = StringIO()
|
|
with warnings.catch_warnings(record=True) as w:
|
|
SeqIO.write([self.record], stream, "gb")
|
|
self.assertEqual(len(w), 1, "a warning should be raised")
|
|
self.assertIn("Invalid date format", str(w[0].message))
|
|
stream.seek(0)
|
|
self.assertIn("1980", stream.getvalue())
|
|
self.assertNotIn(self.record.annotations["date"], stream.getvalue())
|
|
|
|
def test_invalid_date_locale(self):
|
|
# Using a date not in english which is not accepted by the writer
|
|
self.record.annotations["date"] = "04-OKT-1970"
|
|
stream = StringIO()
|
|
with warnings.catch_warnings(record=True) as w:
|
|
SeqIO.write([self.record], stream, "gb")
|
|
self.assertEqual(len(w), 1, "a warning should be raised")
|
|
self.assertIn("Invalid date", str(w[0].message))
|
|
stream.seek(0)
|
|
self.assertIn("1980", stream.getvalue())
|
|
self.assertNotIn(self.record.annotations["date"], stream.getvalue())
|
|
|
|
|
|
class NC_000932(SeqIOFeatureTestBaseClass):
|
|
# This includes an evil dual strand gene
|
|
basename = "NC_000932"
|
|
emblname = None # "AP000423" has different annotation (e.g. more CDS)
|
|
table = 11
|
|
skip_trans_test = [
|
|
"gi|7525080|ref|NP_051037.1|", # dual-strand
|
|
"gi|7525057|ref|NP_051038.1|", # dual-strand
|
|
"gi|90110725|ref|NP_051109.2|", # Invalid annotation? No start codon
|
|
]
|
|
__doc__ = f"Tests using {basename} GenBank and FASTA files from the NCBI"
|
|
# TODO - neat way to change the docstrings...
|
|
|
|
def setUp(self):
|
|
self.gb_filename = os.path.join("GenBank", self.basename + ".gb")
|
|
self.ffn_filename = os.path.join("GenBank", self.basename + ".ffn")
|
|
self.faa_filename = os.path.join("GenBank", self.basename + ".faa")
|
|
self.fna_filename = os.path.join("GenBank", self.basename + ".fna")
|
|
if self.emblname:
|
|
self.embl_filename = os.path.join("EMBL", self.emblname + ".embl")
|
|
|
|
# These tests only need the GenBank file and the FAA file:
|
|
def test_CDS(self):
|
|
"""Checking GenBank CDS translations vs FASTA faa file."""
|
|
gb_record = SeqIO.read(self.gb_filename, "genbank")
|
|
gb_cds = list(SeqIO.parse(self.gb_filename, "genbank-cds"))
|
|
fasta = list(SeqIO.parse(self.faa_filename, "fasta"))
|
|
self.compare_records(gb_cds, fasta)
|
|
cds_features = [f for f in gb_record.features if f.type == "CDS"]
|
|
self.assertEqual(len(cds_features), len(fasta))
|
|
for f, r in zip(cds_features, fasta):
|
|
if r.id in self.skip_trans_test:
|
|
continue
|
|
# Get the nucleotides and translate them
|
|
nuc = f.extract(gb_record.seq)
|
|
self.assertEqual(len(nuc), len(f))
|
|
try:
|
|
pro = nuc.translate(table=self.table, cds=True)
|
|
except TranslationError as e:
|
|
msg = f"{e}\n{r.id!r}, {nuc!r}, {self.table!r}\n{f}"
|
|
self.fail(msg)
|
|
if pro[-1] == "*":
|
|
self.assertEqual(pro[:-1], r.seq)
|
|
else:
|
|
self.assertEqual(pro, r.seq)
|
|
|
|
|
|
class NC_005816(NC_000932):
|
|
basename = "NC_005816"
|
|
emblname = "AE017046"
|
|
table = 11
|
|
skip_trans_test = []
|
|
__doc__ = f"Tests using {basename} GenBank and FASTA files from the NCBI"
|
|
|
|
def test_GenBank_vs_EMBL(self):
|
|
if not self.emblname:
|
|
return
|
|
gb_record = SeqIO.read(self.gb_filename, "genbank")
|
|
embl_record = SeqIO.read(self.embl_filename, "embl")
|
|
if len(embl_record.features) < len(gb_record.features):
|
|
# Used to match, but I've update the GenBank files
|
|
# which now has lots of misc_feature entries not in EMBL
|
|
embl_record.features = [
|
|
f for f in embl_record.features if f.type != "misc_feature"
|
|
]
|
|
gb_record.features = [
|
|
f for f in gb_record.features if f.type != "misc_feature"
|
|
]
|
|
self.compare_record(gb_record, embl_record, expect_minor_diffs=True)
|
|
|
|
def test_Translations(self):
|
|
"""Checking translation of FASTA features (faa vs ffn)."""
|
|
faa_records = list(SeqIO.parse(self.faa_filename, "fasta"))
|
|
ffn_records = list(SeqIO.parse(self.ffn_filename, "fasta"))
|
|
self.assertEqual(len(faa_records), len(ffn_records))
|
|
for faa, fna in zip(faa_records, ffn_records):
|
|
translation = fna.seq.translate(self.table, cds=True)
|
|
if faa.id in self.skip_trans_test:
|
|
continue
|
|
t = SeqRecord(
|
|
translation, id="Translation", description=f"Table {self.table}"
|
|
)
|
|
msg = "FAA vs FNA translation problem:\n%s\n%s\n%s\n" % (
|
|
fna.format("fasta"),
|
|
t.format("fasta"),
|
|
faa.format("fasta"),
|
|
)
|
|
self.assertTrue(translation == faa.seq or translation != faa.seq + "*")
|
|
|
|
def test_Genome(self):
|
|
"""Checking GenBank sequence vs FASTA fna file."""
|
|
gb_record = SeqIO.read(self.gb_filename, "genbank")
|
|
fa_record = SeqIO.read(self.fna_filename, "fasta")
|
|
self.compare_record(gb_record, fa_record)
|
|
if self.emblname is None:
|
|
return
|
|
embl_record = SeqIO.read(self.embl_filename, "embl")
|
|
if len(embl_record.features) < len(gb_record.features):
|
|
# Hack since now out of sync for NC_005816
|
|
embl_record.features = [
|
|
f for f in embl_record.features if f.type != "misc_feature"
|
|
]
|
|
gb_record.features = [
|
|
f for f in gb_record.features if f.type != "misc_feature"
|
|
]
|
|
self.compare_record(gb_record, embl_record, expect_minor_diffs=True)
|
|
|
|
def test_Features(self):
|
|
"""Checking GenBank features sequences vs FASTA ffn file."""
|
|
gb_record = SeqIO.read(self.gb_filename, "genbank")
|
|
features = [f for f in gb_record.features if f.type == "CDS"]
|
|
fa_records = list(SeqIO.parse(self.ffn_filename, "fasta"))
|
|
self.assertEqual(len(fa_records), len(features))
|
|
# This assumes they are in the same order...
|
|
for fa_record, f in zip(fa_records, features):
|
|
# TODO - check the FASTA ID line against the coordinates?
|
|
f_seq = f.extract(gb_record.seq)
|
|
self.assertEqual(fa_record.seq, f_seq)
|
|
self.assertEqual(len(f_seq), len(f))
|
|
|
|
|
|
class TestWriteRead(SeqIOFeatureTestBaseClass):
|
|
"""Test can write and read back files."""
|
|
|
|
def write_read(self, filename, in_format="gb", out_formats=("gb", "embl", "imgt")):
|
|
for out_format in out_formats:
|
|
msg = f"Convert {filename} from {in_format} to {out_format}"
|
|
gb_records = list(SeqIO.parse(filename, in_format))
|
|
# Write it out...
|
|
handle = StringIO()
|
|
SeqIO.write(gb_records, handle, out_format)
|
|
handle.seek(0)
|
|
# Now load it back and check it agrees,
|
|
gb_records2 = list(SeqIO.parse(handle, out_format))
|
|
self.compare_records(gb_records, gb_records2, msg=msg)
|
|
|
|
def test_NC_000932(self):
|
|
"""Write and read back NC_000932.gb."""
|
|
self.write_read(os.path.join("GenBank", "NC_000932.gb"), "gb")
|
|
|
|
def test_NC_005816(self):
|
|
"""Write and read back NC_005816.gb."""
|
|
self.write_read(os.path.join("GenBank", "NC_005816.gb"), "gb")
|
|
|
|
def test_gbvrl1_start(self):
|
|
"""Write and read back gbvrl1_start.seq."""
|
|
self.write_read(os.path.join("GenBank", "gbvrl1_start.seq"), "gb")
|
|
|
|
def test_NT_019265(self):
|
|
"""Write and read back NT_019265.gb."""
|
|
self.write_read(os.path.join("GenBank", "NT_019265.gb"), "gb")
|
|
|
|
def test_cor6(self):
|
|
"""Write and read back cor6_6.gb."""
|
|
self.write_read(os.path.join("GenBank", "cor6_6.gb"), "gb")
|
|
|
|
def test_arab1(self):
|
|
"""Write and read back arab1.gb."""
|
|
self.write_read(os.path.join("GenBank", "arab1.gb"), "gb")
|
|
|
|
def test_one_of(self):
|
|
"""Write and read back of_one.gb."""
|
|
self.write_read(os.path.join("GenBank", "one_of.gb"), "gb")
|
|
|
|
def test_pri1(self):
|
|
"""Write and read back pri1.gb."""
|
|
self.write_read(os.path.join("GenBank", "pri1.gb"), "gb")
|
|
|
|
def test_noref(self):
|
|
"""Write and read back noref.gb."""
|
|
self.write_read(os.path.join("GenBank", "noref.gb"), "gb")
|
|
|
|
def test_origin_line(self):
|
|
"""Write and read back origin_line.gb."""
|
|
self.write_read(os.path.join("GenBank", "origin_line.gb"), "gb")
|
|
|
|
def test_dbsource_wrap(self):
|
|
"""Write and read back dbsource_wrap.gb."""
|
|
with warnings.catch_warnings():
|
|
# Ignore warning about over long DBSOURCE line
|
|
warnings.simplefilter("ignore", category=BiopythonWarning)
|
|
self.write_read(os.path.join("GenBank", "dbsource_wrap.gb"), "gb", ["gb"])
|
|
# Protein so can't convert this to EMBL format
|
|
|
|
def test_blank_seq(self):
|
|
"""Write and read back blank_seq.gb."""
|
|
self.write_read(os.path.join("GenBank", "blank_seq.gb"), "gb", ["gb"])
|
|
# Protein so can't convert this to EMBL format
|
|
|
|
def test_extra_keywords(self):
|
|
"""Write and read back extra_keywords.gb."""
|
|
self.write_read(os.path.join("GenBank", "extra_keywords.gb"), "gb")
|
|
|
|
def test_protein_refseq(self):
|
|
"""Write and read back protein_refseq.gb."""
|
|
self.write_read(os.path.join("GenBank", "protein_refseq.gb"), "gb", ["gb"])
|
|
# Protein so can't convert this to EMBL format
|
|
|
|
def test_protein_refseq2(self):
|
|
"""Write and read back protein_refseq2.gb."""
|
|
self.write_read(os.path.join("GenBank", "protein_refseq2.gb"), "gb", ["gb"])
|
|
# Protein so can't convert this to EMBL format
|
|
|
|
def test_AAA03323(self):
|
|
"""Write and read back AAA03323.embl."""
|
|
self.write_read(os.path.join("EMBL", "AAA03323.embl"), "embl")
|
|
|
|
def test_AE017046(self):
|
|
"""Write and read back AE017046.embl."""
|
|
self.write_read(os.path.join("EMBL", "AE017046.embl"), "embl")
|
|
|
|
def test_DD231055_edited(self):
|
|
"""Write and read back DD231055_edited.embl."""
|
|
self.write_read(os.path.join("EMBL", "DD231055_edited.embl"), "embl")
|
|
|
|
def test_Human_contigs(self):
|
|
"""Write and read back Human_contigs.embl."""
|
|
self.write_read(os.path.join("EMBL", "Human_contigs.embl"), "embl")
|
|
|
|
def test_SC10H5(self):
|
|
"""Write and read back SC10H5.embl."""
|
|
self.write_read(os.path.join("EMBL", "SC10H5.embl"), "embl")
|
|
|
|
def test_TRBG361(self):
|
|
"""Write and read back TRBG361.embl."""
|
|
self.write_read(os.path.join("EMBL", "TRBG361.embl"), "embl")
|
|
|
|
def test_U87107(self):
|
|
"""Write and read back U87107.embl."""
|
|
self.write_read(os.path.join("EMBL", "U87107.embl"), "embl")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
runner = unittest.TextTestRunner(verbosity=2)
|
|
unittest.main(testRunner=runner)
|