mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
allow to read and write nonstandard GC, GS, GR lines in Stockholm format (#4504)
* read and write nonstandard GC, GS, GR lines * no warning for ignored non-standard GF lines * add contributor info * add test file for nonstandard annotations in stockholm format
This commit is contained in:
@ -234,7 +234,10 @@ class AlignmentIterator(interfaces.AlignmentIterator):
|
||||
else:
|
||||
assert len(value) == 1, (key, value)
|
||||
value = value.pop()
|
||||
alignment.annotations[AlignmentIterator.gf_mapping[key]] = value
|
||||
try:
|
||||
alignment.annotations[AlignmentIterator.gf_mapping[key]] = value
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def _store_per_column_annotations(alignment, gc, columns, skipped_columns):
|
||||
@ -251,7 +254,9 @@ class AlignmentIterator(interfaces.AlignmentIterator):
|
||||
raise ValueError(
|
||||
f"{key} length is {len(value)}, expected {columns}"
|
||||
)
|
||||
alignment.column_annotations[AlignmentIterator.gc_mapping[key]] = value
|
||||
alignment.column_annotations[
|
||||
AlignmentIterator.gc_mapping.get(key, key)
|
||||
] = value
|
||||
|
||||
@staticmethod
|
||||
def _store_per_sequence_annotations(alignment, gs):
|
||||
@ -267,7 +272,9 @@ class AlignmentIterator(interfaces.AlignmentIterator):
|
||||
elif key == "DR":
|
||||
record.dbxrefs = value
|
||||
else:
|
||||
record.annotations[AlignmentIterator.gs_mapping[key]] = value
|
||||
record.annotations[
|
||||
AlignmentIterator.gs_mapping.get(key, key)
|
||||
] = value
|
||||
|
||||
@staticmethod
|
||||
def _store_per_sequence_and_per_column_annotations(alignment, gr):
|
||||
@ -277,9 +284,9 @@ class AlignmentIterator(interfaces.AlignmentIterator):
|
||||
break
|
||||
else:
|
||||
raise ValueError(f"Failed to find seqname {seqname}")
|
||||
for keyword, letter_annotation in letter_annotations.items():
|
||||
feature = AlignmentIterator.gr_mapping[keyword]
|
||||
if keyword == "CSA":
|
||||
for key, letter_annotation in letter_annotations.items():
|
||||
feature = AlignmentIterator.gr_mapping.get(key, key)
|
||||
if key == "CSA":
|
||||
letter_annotation = letter_annotation.replace("-", "")
|
||||
else:
|
||||
letter_annotation = letter_annotation.replace(".", "")
|
||||
@ -533,7 +540,7 @@ class AlignmentWriter(interfaces.AlignmentWriter):
|
||||
for record in alignment.sequences:
|
||||
name = record.id.ljust(width)
|
||||
for key, value in record.annotations.items():
|
||||
feature = self.gs_mapping[key]
|
||||
feature = self.gs_mapping.get(key, key)
|
||||
lines.append(f"#=GS {name} {feature} {value}\n")
|
||||
if record.description:
|
||||
lines.append(f"#=GS {name} DE {record.description}\n")
|
||||
@ -558,8 +565,8 @@ class AlignmentWriter(interfaces.AlignmentWriter):
|
||||
# alignment.column_annotations
|
||||
if alignment.column_annotations:
|
||||
for key, value in alignment.column_annotations.items():
|
||||
feature = self.gc_mapping[key]
|
||||
line = f"#=GC {feature}".ljust(start) + value + "\n"
|
||||
feature = self.gc_mapping.get(key, key)
|
||||
line = f"#=GC {feature} ".ljust(start) + value + "\n"
|
||||
lines.append(line)
|
||||
lines.append("//\n")
|
||||
return "".join(lines)
|
||||
@ -592,7 +599,7 @@ class AlignmentWriter(interfaces.AlignmentWriter):
|
||||
indices.reverse()
|
||||
name = record.id.ljust(width)
|
||||
for key, value in record.letter_annotations.items():
|
||||
feature = AlignmentWriter.gr_mapping[key]
|
||||
feature = AlignmentWriter.gr_mapping.get(key, key)
|
||||
j = 0
|
||||
values = bytearray(b"." * len(aligned_sequence))
|
||||
for i, letter in enumerate(aligned_sequence):
|
||||
@ -600,7 +607,7 @@ class AlignmentWriter(interfaces.AlignmentWriter):
|
||||
values[i] = ord(value[j])
|
||||
j += 1
|
||||
value = values.decode()
|
||||
line = f"#=GR {name} {feature}".ljust(start) + value + "\n"
|
||||
line = f"#=GR {name} {feature} ".ljust(start) + value + "\n"
|
||||
yield line
|
||||
|
||||
|
||||
|
@ -320,6 +320,7 @@ please open an issue on GitHub or mention it on the mailing list.
|
||||
- Tiago Antao <https://github.com/tiagoantao>
|
||||
- Tianyi Shi <https://github.com/TianyiShi2001>
|
||||
- Tim Burke <https://github.com/tipabu>
|
||||
- Tom Eulenfeld <https://github.com/trichter>
|
||||
- Tommy Carstensen <https://github.com/tommycarstensen>
|
||||
- Tyghe Vallard <https://github.com/necrolyte2>
|
||||
- Uri Laserson <https://github.com/laserson>
|
||||
|
1
NEWS.rst
1
NEWS.rst
@ -120,6 +120,7 @@ possible, especially the following contributors:
|
||||
- Ricardas Ralys (first contribution)
|
||||
- Rob Miller
|
||||
- Thomas Holder
|
||||
- Tom Eulenfeld (first contribution)
|
||||
- Vladislav Kuznetsov (first contribution)
|
||||
- Wibowo Arindrarto
|
||||
- Yiming Qu (first contribution)
|
||||
|
39
Tests/Stockholm/example_nonstandardannotations.sth
Normal file
39
Tests/Stockholm/example_nonstandardannotations.sth
Normal file
@ -0,0 +1,39 @@
|
||||
# STOCKHOLM 1.0
|
||||
#=GF ID HAT
|
||||
#=GF AC PF02184.18
|
||||
#=GF DE HAT (Half-A-TPR) repeat
|
||||
#=GF AU SMART;
|
||||
#=GF SE Alignment kindly provided by SMART
|
||||
#=GF GA 21.00 21.00;
|
||||
#=GF TC 21.00 21.00;
|
||||
#=GF NC 20.90 20.90;
|
||||
#=GF BM hmmbuild HMM.ann SEED.ann
|
||||
#=GF SM hmmsearch -Z 57096847 -E 1000 --cpu 4 HMM pfamseq
|
||||
#=GF TP Repeat
|
||||
#=GF CL CL0020
|
||||
#=GF RN [1]
|
||||
#=GF RM 9478129
|
||||
#=GF RT The HAT helix, a repetitive motif implicated in RNA processing.
|
||||
#=GF RA Preker PJ, Keller W;
|
||||
#=GF RL Trends Biochem Sci 1998;23:15-16.
|
||||
#=GF DR INTERPRO; IPR003107;
|
||||
#=GF DR SMART; HAT;
|
||||
#=GF DR SO; 0001068; polypeptide_repeat;
|
||||
#=GF CC The HAT (Half A TPR) repeat is found in several RNA processing
|
||||
#=GF CC proteins [1].
|
||||
#=GF SQ 3
|
||||
#=GF nondefaultgf Nondefault GF lines are ignored in io
|
||||
#=GS CRN_DROME/191-222 AC P17886.2
|
||||
#=GS CRN_DROME/191-222 nonstandardgs 42
|
||||
#=GS CLF1_SCHPO/185-216 AC P87312.1
|
||||
#=GS CLF1_SCHPO/185-216 DR PDB; 3JB9 R; 185-216;
|
||||
#=GS O16376_CAEEL/201-233 AC O16376.2
|
||||
CRN_DROME/191-222 KEIDRAREIYERFVYVH.PDVKNWIKFARFEES
|
||||
#=GR CRN_DROME/191-222 nonstandardgr --------X.XXXXXXXX---------------
|
||||
CLF1_SCHPO/185-216 HENERARGIYERFVVVH.PEVTNWLRWARFEEE
|
||||
#=GR CLF1_SCHPO/185-216 SS --HHHHHHHHHHHHHHS.--HHHHHHHHHHHHH
|
||||
O16376_CAEEL/201-233 KEIDRARSVYQRFLHVHGINVQNWIKYAKFEER
|
||||
#=GC SS_cons --HHHHHHHHHHHHHHS.--HHHHHHHHHHHHH
|
||||
#=GC seq_cons KEIDRARuIYERFVaVH.P-VpNWIKaARFEEc
|
||||
#=GC nonstandardgc --------..........---------------
|
||||
//
|
@ -6,6 +6,7 @@
|
||||
import unittest
|
||||
from io import StringIO
|
||||
|
||||
|
||||
from Bio import Align
|
||||
|
||||
|
||||
@ -6688,6 +6689,25 @@ np.array([['V', 'E', 'R', 'Y', 'S', 'L', 'S', 'P', 'M', 'K', 'D', 'L', 'W',
|
||||
stream.close()
|
||||
self.check_alignment_cath3(alignment)
|
||||
|
||||
def test_io_nonstandard_annotations(self):
|
||||
"""Test input and output of nonstandard GC, GS and GR annotation lines."""
|
||||
# We write the alignment once to a stream and read it again to test
|
||||
# both inpiut and output.
|
||||
path = "Stockholm/example_nonstandardannotations.sth"
|
||||
alignments = Align.parse(path, "stockholm")
|
||||
alignment = next(alignments)
|
||||
self.assertNotIn("nonstandardgf", alignment.annotations.keys())
|
||||
stream = StringIO()
|
||||
Align.write(alignment, stream, "stockholm")
|
||||
stream.seek(0)
|
||||
alignments = Align.parse(stream, "stockholm")
|
||||
alignment = next(alignments)
|
||||
stream.close()
|
||||
self.assertIn("nonstandardgc", alignment.column_annotations.keys())
|
||||
self.assertIn("nonstandardgs", alignment.sequences[0].annotations.keys())
|
||||
self.assertIn("nonstandardgr", alignment.sequences[0].letter_annotations.keys())
|
||||
self.assertNotIn("nonstandardgf", alignment.annotations.keys())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
runner = unittest.TextTestRunner(verbosity=2)
|
||||
|
Reference in New Issue
Block a user