Allow reading and writing nonstandard GC, GS, GR lines in Stockholm format (#4504)

* read and write nonstandard GC, GS, GR lines
* no warning for ignored non-standard GF lines
* add contributor info
* add test file for nonstandard annotations in stockholm format
This commit is contained in:
Tom Eulenfeld
2023-12-16 21:06:59 +01:00
committed by GitHub
parent 2bbc5f14d7
commit 4ddf78a738
5 changed files with 79 additions and 11 deletions

View File

@ -234,7 +234,10 @@ class AlignmentIterator(interfaces.AlignmentIterator):
else:
assert len(value) == 1, (key, value)
value = value.pop()
alignment.annotations[AlignmentIterator.gf_mapping[key]] = value
try:
alignment.annotations[AlignmentIterator.gf_mapping[key]] = value
except KeyError:
pass
@staticmethod
def _store_per_column_annotations(alignment, gc, columns, skipped_columns):
@ -251,7 +254,9 @@ class AlignmentIterator(interfaces.AlignmentIterator):
raise ValueError(
f"{key} length is {len(value)}, expected {columns}"
)
alignment.column_annotations[AlignmentIterator.gc_mapping[key]] = value
alignment.column_annotations[
AlignmentIterator.gc_mapping.get(key, key)
] = value
@staticmethod
def _store_per_sequence_annotations(alignment, gs):
@ -267,7 +272,9 @@ class AlignmentIterator(interfaces.AlignmentIterator):
elif key == "DR":
record.dbxrefs = value
else:
record.annotations[AlignmentIterator.gs_mapping[key]] = value
record.annotations[
AlignmentIterator.gs_mapping.get(key, key)
] = value
@staticmethod
def _store_per_sequence_and_per_column_annotations(alignment, gr):
@ -277,9 +284,9 @@ class AlignmentIterator(interfaces.AlignmentIterator):
break
else:
raise ValueError(f"Failed to find seqname {seqname}")
for keyword, letter_annotation in letter_annotations.items():
feature = AlignmentIterator.gr_mapping[keyword]
if keyword == "CSA":
for key, letter_annotation in letter_annotations.items():
feature = AlignmentIterator.gr_mapping.get(key, key)
if key == "CSA":
letter_annotation = letter_annotation.replace("-", "")
else:
letter_annotation = letter_annotation.replace(".", "")
@ -533,7 +540,7 @@ class AlignmentWriter(interfaces.AlignmentWriter):
for record in alignment.sequences:
name = record.id.ljust(width)
for key, value in record.annotations.items():
feature = self.gs_mapping[key]
feature = self.gs_mapping.get(key, key)
lines.append(f"#=GS {name} {feature} {value}\n")
if record.description:
lines.append(f"#=GS {name} DE {record.description}\n")
@ -558,8 +565,8 @@ class AlignmentWriter(interfaces.AlignmentWriter):
# alignment.column_annotations
if alignment.column_annotations:
for key, value in alignment.column_annotations.items():
feature = self.gc_mapping[key]
line = f"#=GC {feature}".ljust(start) + value + "\n"
feature = self.gc_mapping.get(key, key)
line = f"#=GC {feature} ".ljust(start) + value + "\n"
lines.append(line)
lines.append("//\n")
return "".join(lines)
@ -592,7 +599,7 @@ class AlignmentWriter(interfaces.AlignmentWriter):
indices.reverse()
name = record.id.ljust(width)
for key, value in record.letter_annotations.items():
feature = AlignmentWriter.gr_mapping[key]
feature = AlignmentWriter.gr_mapping.get(key, key)
j = 0
values = bytearray(b"." * len(aligned_sequence))
for i, letter in enumerate(aligned_sequence):
@ -600,7 +607,7 @@ class AlignmentWriter(interfaces.AlignmentWriter):
values[i] = ord(value[j])
j += 1
value = values.decode()
line = f"#=GR {name} {feature}".ljust(start) + value + "\n"
line = f"#=GR {name} {feature} ".ljust(start) + value + "\n"
yield line

View File

@ -320,6 +320,7 @@ please open an issue on GitHub or mention it on the mailing list.
- Tiago Antao <https://github.com/tiagoantao>
- Tianyi Shi <https://github.com/TianyiShi2001>
- Tim Burke <https://github.com/tipabu>
- Tom Eulenfeld <https://github.com/trichter>
- Tommy Carstensen <https://github.com/tommycarstensen>
- Tyghe Vallard <https://github.com/necrolyte2>
- Uri Laserson <https://github.com/laserson>

View File

@ -120,6 +120,7 @@ possible, especially the following contributors:
- Ricardas Ralys (first contribution)
- Rob Miller
- Thomas Holder
- Tom Eulenfeld (first contribution)
- Vladislav Kuznetsov (first contribution)
- Wibowo Arindrarto
- Yiming Qu (first contribution)

View File

@ -0,0 +1,39 @@
# STOCKHOLM 1.0
#=GF ID HAT
#=GF AC PF02184.18
#=GF DE HAT (Half-A-TPR) repeat
#=GF AU SMART;
#=GF SE Alignment kindly provided by SMART
#=GF GA 21.00 21.00;
#=GF TC 21.00 21.00;
#=GF NC 20.90 20.90;
#=GF BM hmmbuild HMM.ann SEED.ann
#=GF SM hmmsearch -Z 57096847 -E 1000 --cpu 4 HMM pfamseq
#=GF TP Repeat
#=GF CL CL0020
#=GF RN [1]
#=GF RM 9478129
#=GF RT The HAT helix, a repetitive motif implicated in RNA processing.
#=GF RA Preker PJ, Keller W;
#=GF RL Trends Biochem Sci 1998;23:15-16.
#=GF DR INTERPRO; IPR003107;
#=GF DR SMART; HAT;
#=GF DR SO; 0001068; polypeptide_repeat;
#=GF CC The HAT (Half A TPR) repeat is found in several RNA processing
#=GF CC proteins [1].
#=GF SQ 3
#=GF nondefaultgf Nondefault GF lines are ignored in io
#=GS CRN_DROME/191-222 AC P17886.2
#=GS CRN_DROME/191-222 nonstandardgs 42
#=GS CLF1_SCHPO/185-216 AC P87312.1
#=GS CLF1_SCHPO/185-216 DR PDB; 3JB9 R; 185-216;
#=GS O16376_CAEEL/201-233 AC O16376.2
CRN_DROME/191-222 KEIDRAREIYERFVYVH.PDVKNWIKFARFEES
#=GR CRN_DROME/191-222 nonstandardgr --------X.XXXXXXXX---------------
CLF1_SCHPO/185-216 HENERARGIYERFVVVH.PEVTNWLRWARFEEE
#=GR CLF1_SCHPO/185-216 SS --HHHHHHHHHHHHHHS.--HHHHHHHHHHHHH
O16376_CAEEL/201-233 KEIDRARSVYQRFLHVHGINVQNWIKYAKFEER
#=GC SS_cons --HHHHHHHHHHHHHHS.--HHHHHHHHHHHHH
#=GC seq_cons KEIDRARuIYERFVaVH.P-VpNWIKaARFEEc
#=GC nonstandardgc --------..........---------------
//

View File

@ -6,6 +6,7 @@
import unittest
from io import StringIO
from Bio import Align
@ -6688,6 +6689,25 @@ np.array([['V', 'E', 'R', 'Y', 'S', 'L', 'S', 'P', 'M', 'K', 'D', 'L', 'W',
stream.close()
self.check_alignment_cath3(alignment)
def test_io_nonstandard_annotations(self):
    """Test input and output of nonstandard GC, GS and GR annotation lines.

    The alignment is parsed from the fixture file, written to an in-memory
    stream, and parsed again, so that both the reader and the writer are
    exercised. Nonstandard GC, GS and GR features must survive the round
    trip; nonstandard GF features must not show up in the annotations.
    """
    path = "Stockholm/example_nonstandardannotations.sth"
    # The fixture spells its nonstandard GF feature "nondefaultgf". Checking
    # only "nonstandardgf" would pass vacuously, so both spellings are tested.
    ignored_gf_keys = ("nonstandardgf", "nondefaultgf")

    alignments = Align.parse(path, "stockholm")
    alignment = next(alignments)
    for key in ignored_gf_keys:
        self.assertNotIn(key, alignment.annotations)

    # Write the alignment once to a stream and read it again to test both
    # input and output; the context manager closes the stream even if
    # writing or re-parsing fails.
    with StringIO() as stream:
        Align.write(alignment, stream, "stockholm")
        stream.seek(0)
        alignments = Align.parse(stream, "stockholm")
        alignment = next(alignments)

    first_record = alignment.sequences[0]
    self.assertIn("nonstandardgc", alignment.column_annotations)
    self.assertIn("nonstandardgs", first_record.annotations)
    self.assertIn("nonstandardgr", first_record.letter_annotations)
    for key in ignored_gf_keys:
        self.assertNotIn(key, alignment.annotations)
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=2)