mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
$ pyupgrade --keep-percent-format --py39-plus \ Bio/*.py Bio/*/*.py BioSQL/*.py Didn't find any changes in tests, scripts, or docs. Followed by removing a few now redundant imports and black in one case.
603 lines
26 KiB
Python
603 lines
26 KiB
Python
# Copyright 2011 by Wibowo Arindrarto (w.arindrarto@gmail.com)
|
|
# Revisions copyright 2011-2016 by Peter Cock.
|
|
#
|
|
# This file is part of the Biopython distribution and governed by your
|
|
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
|
# Please see the LICENSE file that should have been included as part of this
|
|
# package.
|
|
"""Bio.SeqIO parser for the ABI format.
|
|
|
|
ABI is the format used by Applied Biosystems sequencing machines to store
|
|
sequencing results.
|
|
|
|
For more details on the format specification, visit:
|
|
http://www6.appliedbiosystems.com/support/software_community/ABIF_File_Format.pdf
|
|
|
|
"""
|
|
|
|
import datetime
|
|
import struct
|
|
import sys
|
|
from os.path import basename
|
|
from typing import List
|
|
|
|
from Bio.Seq import Seq
|
|
from Bio.SeqRecord import SeqRecord
|
|
|
|
from .Interfaces import SequenceIterator
|
|
|
|
# dictionary for determining which tags goes into SeqRecord annotation
|
|
# each key is tag_name + tag_number
|
|
# if a tag entry needs to be added, just add its key and its key
|
|
# for the annotations dictionary as the value
|
|
# dictionary for tags that require preprocessing before use in creating
|
|
# seqrecords
|
|
_EXTRACT = {
|
|
"TUBE1": "sample_well",
|
|
"DySN1": "dye",
|
|
"GTyp1": "polymer",
|
|
"MODL1": "machine_model",
|
|
}
|
|
|
|
|
|
# Complete data structure representing 98% of the API. The general section
|
|
# represents the part of the API that's common to ALL instruments, whereas the
|
|
# instrument specific sections are labelled as they are in the ABIF spec
|
|
#
|
|
# Keys don't seem to clash from machine to machine, so when we parse, we look
|
|
# for ANY key, and store that in the raw ABIF data structure attached to the
|
|
# annotations, with the assumption that anyone parsing the data can look up the
|
|
# spec themselves
|
|
#
|
|
# Key definitions are retained in case end users want "nice" labels pre-made
|
|
# for them for all of the available fields.
|
|
_INSTRUMENT_SPECIFIC_TAGS = {}
|
|
|
|
# fmt: off
|
|
_INSTRUMENT_SPECIFIC_TAGS["general"] = {
|
|
"APFN2": "Sequencing Analysis parameters file name",
|
|
"APXV1": "Analysis Protocol XML schema version",
|
|
"APrN1": "Analysis Protocol settings name",
|
|
"APrV1": "Analysis Protocol settings version",
|
|
"APrX1": "Analysis Protocol XML string",
|
|
"CMNT1": "Sample Comment",
|
|
"CTID1": "Container Identifier, a.k.a. plate barcode",
|
|
"CTNM1": "Container name, usually identical to CTID, but not necessarily so",
|
|
"CTTL1": "Comment Title",
|
|
"CpEP1": "Capillary type electrophoresis. 1 for a capillary based machine. 0 for a slab gel based machine.",
|
|
"DATA1": "Channel 1 raw data",
|
|
"DATA2": "Channel 2 raw data",
|
|
"DATA3": "Channel 3 raw data",
|
|
"DATA4": "Channel 4 raw data",
|
|
"DATA5": "Short Array holding measured volts/10 (EP voltage) during run",
|
|
"DATA6": "Short Array holding measured milliAmps trace (EP current) during run",
|
|
"DATA7": "Short Array holding measured milliWatts trace (Laser EP Power) during run",
|
|
"DATA8": "Short Array holding measured oven Temperature (polymer temperature) trace during run",
|
|
"DATA9": "Channel 9 processed data",
|
|
"DATA10": "Channel 10 processed data",
|
|
"DATA11": "Channel 11 processed data",
|
|
"DATA12": "Channel 12 processed data",
|
|
# Prism 3100/3100-Avant may provide DATA105
|
|
# 3130/3130-XL may provide DATA105
|
|
# 3530/3530-XL may provide DATA105-199, 9-12, 205-299
|
|
"DSam1": "Downsampling factor",
|
|
"DySN1": "Dye set name",
|
|
"Dye#1": "Number of dyes",
|
|
"DyeN1": "Dye 1 name",
|
|
"DyeN2": "Dye 2 name",
|
|
"DyeN3": "Dye 3 name",
|
|
"DyeN4": "Dye 4 name",
|
|
"DyeW1": "Dye 1 wavelength",
|
|
"DyeW2": "Dye 2 wavelength",
|
|
"DyeW3": "Dye 3 wavelength",
|
|
"DyeW4": "Dye 4 wavelength",
|
|
# 'DyeN5-N': 'Dye 5-N Name',
|
|
# 'DyeW5-N': 'Dye 5-N Wavelength',
|
|
"EPVt1": "Electrophoresis voltage setting (volts)",
|
|
"EVNT1": "Start Run event",
|
|
"EVNT2": "Stop Run event",
|
|
"EVNT3": "Start Collection event",
|
|
"EVNT4": "Stop Collection event",
|
|
"FWO_1": 'Base Order. Sequencing Analysis Filter wheel order. Fixed for 3500 at "GATC"',
|
|
"GTyp1": "Gel or polymer Type",
|
|
"InSc1": "Injection time (seconds)",
|
|
"InVt1": "Injection voltage (volts)",
|
|
"LANE1": "Lane/Capillary",
|
|
"LIMS1": "Sample tracking ID",
|
|
"LNTD1": "Length to detector",
|
|
"LsrP1": "Laser Power setting (micro Watts)",
|
|
"MCHN1": "Instrument name and serial number",
|
|
"MODF1": "Data collection module file",
|
|
"MODL1": "Model number",
|
|
"NAVG1": "Pixels averaged per lane",
|
|
"NLNE1": "Number of capillaries",
|
|
"OfSc1": "List of scans that are marked off scale in Collection. (optional)",
|
|
# OvrI and OrvV are listed as "1-N", and "One for each dye (unanalyzed
|
|
# and/or analyzed data)"
|
|
"OvrI1": "List of scan number indexes that have values greater than 32767 but did not "
|
|
"saturate the camera. In Genemapper samples, this can have indexes with "
|
|
"values greater than 32000. In sequencing samples, this cannot have "
|
|
"indexes with values greater than 32000.",
|
|
"OvrI2": "List of scan number indexes that have values greater than 32767 but did not "
|
|
"saturate the camera. In Genemapper samples, this can have indexes with "
|
|
"values greater than 32000. In sequencing samples, this cannot have "
|
|
"indexes with values greater than 32000.",
|
|
"OvrI3": "List of scan number indexes that have values greater than 32767 but did not "
|
|
"saturate the camera. In Genemapper samples, this can have indexes with "
|
|
"values greater than 32000. In sequencing samples, this cannot have "
|
|
"indexes with values greater than 32000.",
|
|
"OvrI4": "List of scan number indexes that have values greater than 32767 but did not "
|
|
"saturate the camera. In Genemapper samples, this can have indexes with "
|
|
"values greater than 32000. In sequencing samples, this cannot have "
|
|
"indexes with values greater than 32000.",
|
|
"OvrV1": "List of color data values found at the locations listed in the OvrI tag. "
|
|
"There must be exactly as many numbers in this array as in the OvrI array.",
|
|
"OvrV2": "List of color data values found at the locations listed in the OvrI tag. "
|
|
"There must be exactly as many numbers in this array as in the OvrI array.",
|
|
"OvrV3": "List of color data values found at the locations listed in the OvrI tag. "
|
|
"There must be exactly as many numbers in this array as in the OvrI array.",
|
|
"OvrV4": "List of color data values found at the locations listed in the OvrI tag. "
|
|
"There must be exactly as many numbers in this array as in the OvrI array.",
|
|
"PDMF1": "Sequencing Analysis Mobility file name chosen in collection",
|
|
"RMXV1": "Run Module XML schema version",
|
|
"RMdN1": "Run Module name (same as MODF)",
|
|
"RMdX1": "Run Module XML string",
|
|
"RPrN1": "Run Protocol name",
|
|
"RPrV1": "Run Protocol version",
|
|
"RUND1": "Run Started Date",
|
|
"RUND2": "Run Stopped Date",
|
|
"RUND3": "Data Collection Started Date",
|
|
"RUND4": "Data Collection Stopped date",
|
|
"RUNT1": "Run Started Time",
|
|
"RUNT2": "Run Stopped Time",
|
|
"RUNT3": "Data Collection Started Time",
|
|
"RUNT4": "Data Collection Stopped Time",
|
|
"Rate1": "Scanning Rate. Milliseconds per frame.",
|
|
"RunN1": "Run Name",
|
|
"SCAN1": "Number of scans",
|
|
"SMED1": "Polymer lot expiration date",
|
|
"SMLt1": "Polymer lot number",
|
|
"SMPL1": "Sample name",
|
|
"SVER1": "Data collection software version",
|
|
"SVER3": "Data collection firmware version",
|
|
"Satd1": "Array of longs representing the scan numbers of data points, which are flagged as saturated by data collection (optional)",
|
|
"Scal1": "Rescaling divisor for color data",
|
|
"Scan1": "Number of scans (legacy - use SCAN)",
|
|
"TUBE1": "Well ID",
|
|
"Tmpr1": "Run temperature setting",
|
|
"User1": "Name of user who created the plate (optional)",
|
|
}
|
|
|
|
# No instrument specific tags
|
|
# _INSTRUMENT_SPECIFIC_TAGS['abi_prism_3100/3100-Avant'] = {
|
|
# }
|
|
|
|
# Tags specific to the 3130/3130xl instruments.
_INSTRUMENT_SPECIFIC_TAGS["abi_3130/3130xl"] = {
    "CTOw1": "Container owner",
    "HCFG1": "Instrument Class",
    "HCFG2": "Instrument Family",
    "HCFG3": "Official Instrument Name",
    "HCFG4": "Instrument Parameters",
    # Keys are a 4 character tag name plus the tag number (directory entries
    # are unpacked with a "4s" name field), so the previous five character
    # spelling "RMdVa1" could never match a parsed tag.
    "RMdV1": "Run Module version",
}
|
|
|
|
# Tags specific to the instrument section labelled 3530/3530xl.
_INSTRUMENT_SPECIFIC_TAGS["abi_3530/3530xl"] = {
    "AAct1": "Primary Analysis Audit Active indication. True if system auditing was enabled during the last write of this file, "
             "false if system auditing was disabled.",
    "ABED1": "Anode buffer expiration date using ISO 8601 format using the patterns YYYY-MM-DDTHH:MM:SS.ss+/-HH:MM. Hundredths of a second are optional.",
    "ABID1": "Anode buffer tray first installed date",
    "ABLt1": "Anode buffer lot number",
    "ABRn1": "Number of runs (injections) processed with the current Anode Buffer (runs allowed - runs remaining)",
    "ABTp1": "Anode buffer type",
    "AEPt1": "Analysis Ending scan number for basecalling on initial analysis",
    "AEPt2": "Analysis Ending scan number for basecalling on last analysis",
    "APCN1": "Amplicon name",
    "ARTN1": "Analysis Return code. Produced only by 5 Prime basecaller 1.0b3",
    "ASPF1": "Flag to indicate whether adaptive processing worked or not",
    "ASPt1": "Analysis Starting scan number for first analysis",
    "ASPt2": "Analysis Starting scan number for last analysis",
    "AUDT2": "Audit log used across 3500 software (optional)",
    "AVld1": "Assay validation flag (true or false)",
    "AmbT1": "Record of ambient temperature readings",
    "AsyC1": "The assay contents (xml format)",
    "AsyN1": "The assay name",
    "AsyV1": "The assay version",
    "B1Pt1": "Reference scan number for mobility and spacing curves for first analysis",
    "B1Pt2": "Reference scan number for mobility and spacing curves for last analysis",
    "BCTS1": "Basecaller timestamp. Time of completion of most recent analysis",
    "BcRn1": "Basecalling qc code",
    "BcRs1": "Basecalling warnings, a concatenated comma separated string",
    "BcRs2": "Basecalling errors, a concatenated comma separated string",
    "CAED1": "Capillary array expiration",
    "CALt1": "Capillary array lot number",
    "CARn1": "Number of injections processed (including the one of which this sample was a part) through the capillary array",
    "CASN1": "Capillary array serial number",
    "CBED1": "Cathode buffer expiration date",
    "CBID1": "Cathode buffer tray first installed date",
    "CBLt1": "Cathode buffer lot number",
    "CBRn1": "Number of runs (injections) processed with the current Cathode Buffer (runs allowed - runs remaining)",
    "CBTp1": "Cathode buffer type",
    "CLRG1": "Start of the clear range (inclusive).",
    "CLRG2": "Clear range length",
    "CRLn1": "Contiguous read length",
    "CRLn2": 'One of "Pass", "Fail", or "Check"',
    "CTOw1": "The name entered as the Owner of a plate, in the plate editor",
    "CkSm1": "File checksum",
    "DCEv1": "A list of door-close events, separated by semicolon. Door open events are generally paired with door close events.",
    "DCHT1": "Reserved for backward compatibility. The detection cell heater temperature setting from the Run Module. Not used for 3500.",
    "DOEv1": "A list of door-open events, separated by semicolon. Door close events are generally paired with door open events.",
    "ESig2": "Electronic signature record used across 3500 software",
    "FTab1": "Feature table. Can be created by Nibbler for Clear Range.",
    "FVoc1": "Feature table vocabulary. Can be created by Nibbler for Clear Range.",
    "Feat1": "Features. Can be created by Nibbler for Clear Range.",
    "HCFG1": "The Instrument Class. All upper case, no spaces. Initial valid value: CE",
    "HCFG2": "The Instrument Family. All upper case, no spaces. Valid values: 31XX or 37XX for UDC, 35XX (for 3500)",
    "HCFG3": "The official instrument name. Mixed case, minus any special formatting. Initial valid values: 3130, 3130xl, 3730, 3730xl, 3500, 3500xl.",
    "HCFG4": "Instrument parameters. Contains key-value pairs of instrument configuration information, separated by semicolons. "
             "Four parameters are included initially: UnitID=<UNITD number>, CPUBoard=<board type>, "
             "ArraySize=<# of capillaries>, SerialNumber=<Instrument Serial#>.",
    "InjN1": "Injection name",
    "LAST1": "Parameter settings information",
    "NOIS1": "The estimate of rms baseline noise (S/N ratio) for each dye for a successfully analyzed sample. "
             "Corresponds in order to the raw data in tags DATA 1-4. KB basecaller only.",
    "P1AM1": "Amplitude of primary peak, which is not necessarily equal to corresponding signal strength at that position",
    "P1RL1": "Deviation of primary peak position from (PLoc,2), times 100, rounded to integer",
    "P1WD1": "Full-width Half-max of primary peak, times 100, rounded to integer. "
             "Corresponding signal intensity is not necessarily equal to one half of primary peak amplitude",
    "P2AM1": "Amplitude of secondary peak, which is not necessarily equal to corresponding signal strength at that position",
    "P2BA1": "Base of secondary peak",
    "P2RL1": "Deviation of secondary peak position from (PLoc,2), times 100, rounded to integer",
    "PBAS1": "Array of sequence characters edited by user",
    "PBAS2": "Array of sequence characters as called by Basecaller",
    "PCON1": "Array of quality Values (0-255) as edited by user",
    "PCON2": "Array of quality values (0-255) as called by Basecaller",
    "PDMF2": "Mobility file name chosen in most recent analysis (identical to PDMF1)",
    "PLOC1": "Array of peak locations edited by user",
    "PLOC2": "Array of peak locations as called by Basecaller",
    "PRJT1": "SeqScape 2.0 project template name",
    "PROJ4": "SeqScape 2.0 project name",
    "PSZE1": "Plate size. The number of sample positions in the container. Current allowed values: 96, 384.",
    "PTYP1": "Plate type. Current allowed values: 96-Well, 384-Well.",
    "PuSc1": "Median pupscore",
    "QV201": "QV20+ value",
    "QV202": 'One of "Pass", "Fail", or "Check"',
    "QcPa1": "QC parameters",
    "QcRn1": "Trimming and QC code",
    "QcRs1": "QC warnings, a concatenated comma separated string",
    "QcRs2": "QC errors, a concatenated comma separated string",
    "RGOw1": "The name entered as the Owner of a Results Group, in the Results Group Editor. Implemented as the user name from the results group.",
    "RInj1": "Reinjection number. The reinjection number that this sample belongs to. Not present if there was no reinjection.",
    "RNmF1": "Raman normalization factor",
    "RevC1": "for whether the sequence has been complemented",
    "RunN1": "Run name (which, for 3500, is different from injection name)",
    "S/N%1": "Signal strength for each dye",
    "SMID1": "Polymer first installed date",
    "SMRn1": "Number of runs (injections) processed with the current polymer (runs allowed - runs remaining)",
    "SPAC1": "Average peak spacing used in last analysis",
    "SPAC2": "Basecaller name - corresponds to name of bcp file.",
    "SPAC3": "Average peak spacing last calculated by the Basecaller.",
    "SPEC1": "Sequencing Analysis Specimen Name",
    "SVER2": "Basecaller version number",
    "SVER4": "Sample File Format Version String",
    "ScPa1": "The parameter string of size caller",
    "ScSt1": "Raw data start point. Set to 0 for 3500 data collection.",
    "SpeN1": "Active spectral calibration name",
    "TrPa1": "Trimming parameters",
    "TrSc1": "Trace score.",
    "TrSc2": 'One of "Pass", "Fail", or "Check"',
    # Description previously read "aria", a misspelling of "area".
    "phAR1": "Trace peak area ratio",
    "phCH1": 'Chemistry type ("term", "prim", "unknown"), based on DYE_1 information',
    "phDY1": 'Dye ("big", "d-rhod", "unknown"), based on mob file information',
    "phQL1": "Maximum Quality Value",
    "phTR1": "Set Trim region",
    "phTR2": "Trim probability",
}
|
|
|
|
# Tags specific to the 3730/3730xl instruments.
_INSTRUMENT_SPECIFIC_TAGS["abi_3730/3730xl"] = dict(
    BufT1="Buffer tray heater temperature (degrees C)",
)
# fmt: on
|
|
|
|
# dictionary for data unpacking format
|
|
_BYTEFMT = {
|
|
1: "b", # byte
|
|
2: "s", # char
|
|
3: "H", # word
|
|
4: "h", # short
|
|
5: "i", # long
|
|
6: "2i", # rational, legacy unsupported
|
|
7: "f", # float
|
|
8: "d", # double
|
|
10: "h2B", # date
|
|
11: "4B", # time
|
|
12: "2i2b", # thumb
|
|
13: "B", # bool
|
|
14: "2h", # point, legacy unsupported
|
|
15: "4h", # rect, legacy unsupported
|
|
16: "2i", # vPoint, legacy unsupported
|
|
17: "4i", # vRect, legacy unsupported
|
|
18: "s", # pString
|
|
19: "s", # cString
|
|
20: "2i", # tag, legacy unsupported
|
|
}
|
|
# header data structure (excluding 4 byte ABIF marker)
|
|
_HEADFMT = ">H4sI2H3I"
|
|
# directory data structure
|
|
_DIRFMT = ">4sI2H4I"
|
|
|
|
# Flat list of every known ABIF key, collected from all instrument sections
# in section order (keys shared between sections appear more than once).
# Built with a comprehension so no loop variable is left in the module
# namespace.
__global_tag_listing: list[str] = [
    tag_key
    for instrument_tags in _INSTRUMENT_SPECIFIC_TAGS.values()
    for tag_key in instrument_tags
]
|
|
|
|
|
|
def _get_string_tag(opt_bytes_value, default=None):
|
|
"""Return the string value of the given an optional raw bytes tag value.
|
|
|
|
If the bytes value is None, return the given default value.
|
|
|
|
"""
|
|
if opt_bytes_value is None:
|
|
return default
|
|
try:
|
|
return opt_bytes_value.decode()
|
|
except UnicodeDecodeError:
|
|
return opt_bytes_value.decode(encoding=sys.getdefaultencoding())
|
|
|
|
|
|
class AbiIterator(SequenceIterator):
    """Parser for Abi files."""

    def __init__(self, source, trim=False):
        """Return an iterator for the Abi file format.

        Arguments:
         - source - handed to SequenceIterator, which is asked for binary
           mode.
         - trim - if True, records with base calls are passed through
           _abi_trim before being yielded.

        """
        self.trim = trim
        super().__init__(source, mode="b", fmt="ABI")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator.

        Raises ValueError if nothing can be read from the handle, and
        OSError if the first four bytes are not the ABIF marker.
        """
        # check if input file is a valid Abi file
        marker = handle.read(4)
        if not marker:
            # handle empty file gracefully
            raise ValueError("Empty file.")

        if marker != b"ABIF":
            raise OSError(f"File should start ABIF, not {marker!r}")
        return self.iterate(handle)

    def iterate(self, handle):
        """Parse the file and generate SeqRecord objects.

        Exactly one record is yielded. Raises ValueError if trimming was
        requested for a file with base calls but no quality values.
        """
        # the run start / finish annotations are each assembled from a
        # date tag and a time tag, so those four are collected separately
        times = {"RUND1": "", "RUND2": "", "RUNT1": "", "RUNT2": ""}

        # initialize annotations, every extracted annotation defaults to None
        annot = dict.fromkeys(_EXTRACT.values())

        # parse header and extract data from directories
        header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))

        # Set default sample ID value, which we expect to be present in most
        # cases in the SMPL1 tag, but may be missing.
        sample_id = "<unknown id>"

        raw = {}
        seq = qual = None
        for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
            key = tag_name + str(tag_number)

            raw[key] = tag_data

            # PBAS2 is base-called sequence, only available in 3530
            if key == "PBAS2":
                seq = tag_data.decode()
            # PCON2 is quality values of base-called sequence
            elif key == "PCON2":
                # One quality value per byte.  Iterating the bytes gives the
                # integer values directly; decoding them as text first fails
                # (or merges bytes) for values of 128 and above.
                qual = list(tag_data)
            # SMPL1 is sample id entered before sequencing run, it must be
            # a string.
            elif key == "SMPL1":
                sample_id = _get_string_tag(tag_data)
            elif key in times:
                times[key] = tag_data
            elif key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

        # set time annotations
        annot["run_start"] = f"{times['RUND1']} {times['RUNT1']}"
        annot["run_finish"] = f"{times['RUND2']} {times['RUNT2']}"

        # raw data (for advanced end users benefit)
        annot["abif_raw"] = raw

        # fsa check: a file without any base call tag is treated as FSA
        is_fsa_file = all(tn not in raw for tn in ("PBAS1", "PBAS2"))

        # use the file name (minus the expected extension) as SeqRecord.name
        # if the handle has a name
        extension = ".fsa" if is_fsa_file else ".ab1"
        try:
            file_name = basename(handle.name).replace(extension, "")
        except AttributeError:
            file_name = ""

        if is_fsa_file:
            sample_id = _get_string_tag(raw.get("LIMS1"), sample_id)
            description = _get_string_tag(raw.get("CTID1"), "<unknown description>")
            sequence = Seq("")
        else:
            description = ""
            # NOTE(review): when only PBAS1 is present seq is still None
            # here - confirm how Seq(None) behaves for such files.
            sequence = Seq(seq)

        record = SeqRecord(
            sequence,
            id=sample_id,
            name=file_name,
            description=description,
            annotations=annot,
        )

        if qual:
            # Expect this to be missing for FSA files.
            record.letter_annotations["phred_quality"] = qual
        elif self.trim and not is_fsa_file:
            raise ValueError(
                "The 'abi-trim' format can not be used for files without"
                " quality values."
            )

        if self.trim and not is_fsa_file:
            record = _abi_trim(record)

        record.annotations["molecule_type"] = "DNA"
        yield record
|
|
|
|
|
|
def _AbiTrimIterator(handle):
    """Return an AbiIterator for the handle with trimming switched on (PRIVATE)."""
    trimming_iterator = AbiIterator(handle, trim=True)
    return trimming_iterator
|
|
|
|
|
|
def _abi_parse_header(header, handle):
    """Yield the contents of each directory entry (PRIVATE).

    Arguments:
     - header - tuple unpacked from the file header with _HEADFMT; the
       entry size, entry count and directory offset are read from
       positions 4, 5 and 7.
     - handle - seekable binary handle of the file being parsed.

    Yields (tag name, tag number, parsed tag data) tuples, one per
    directory entry, in directory order.
    """
    head_elem_size = header[4]
    head_elem_num = header[5]
    head_offset = header[7]
    dir_entry_size = struct.calcsize(_DIRFMT)

    for index in range(head_elem_num):
        start = head_offset + index * head_elem_size
        handle.seek(start)
        (
            raw_name,
            tag_number,
            elem_code,
            _elem_size,
            elem_num,
            data_size,
            data_offset,
            _data_handle,
        ) = struct.unpack(_DIRFMT, handle.read(dir_entry_size))
        tag_name = raw_name.decode()

        # if data size <= 4 bytes, data is stored inside the directory entry
        # itself, in the data offset field 20 bytes into the entry
        if data_size <= 4:
            data_offset = start + 20
        handle.seek(data_offset)
        data = handle.read(data_size)
        yield tag_name, tag_number, _parse_tag_data(elem_code, elem_num, data)
|
|
|
|
|
|
def _abi_trim(seq_record):
|
|
"""Trims the sequence using Richard Mott's modified trimming algorithm (PRIVATE).
|
|
|
|
Arguments:
|
|
- seq_record - SeqRecord object to be trimmed.
|
|
|
|
Trimmed bases are determined from their segment score, which is a
|
|
cumulative sum of each base's score. Base scores are calculated from
|
|
their quality values.
|
|
|
|
More about the trimming algorithm:
|
|
http://www.phrap.org/phredphrap/phred.html
|
|
http://resources.qiagenbioinformatics.com/manuals/clcgenomicsworkbench/650/Quality_trimming.html
|
|
"""
|
|
start = False # flag for starting position of trimmed sequence
|
|
segment = 20 # minimum sequence length
|
|
trim_start = 0 # init start index
|
|
cutoff = 0.05 # default cutoff value for calculating base score
|
|
|
|
if len(seq_record) <= segment:
|
|
return seq_record
|
|
else:
|
|
# calculate base score
|
|
score_list = [
|
|
cutoff - (10 ** (qual / -10.0))
|
|
for qual in seq_record.letter_annotations["phred_quality"]
|
|
]
|
|
|
|
# calculate cumulative score
|
|
# if cumulative value < 0, set it to 0
|
|
# first value is set to 0, because of the assumption that
|
|
# the first base will always be trimmed out
|
|
cummul_score = [0]
|
|
for i in range(1, len(score_list)):
|
|
score = cummul_score[-1] + score_list[i]
|
|
if score < 0:
|
|
cummul_score.append(0)
|
|
else:
|
|
cummul_score.append(score)
|
|
if not start:
|
|
# trim_start = value when cumulative score is first > 0
|
|
trim_start = i
|
|
start = True
|
|
|
|
# trim_finish = index of highest cumulative score,
|
|
# marking the end of sequence segment with highest cumulative score
|
|
trim_finish = cummul_score.index(max(cummul_score))
|
|
|
|
return seq_record[trim_start:trim_finish]
|
|
|
|
|
|
def _parse_tag_data(elem_code, elem_num, raw_data):
    """Return the unpacked value of a single tag (PRIVATE).

    Arguments:
     - elem_code - element type code, looked up in _BYTEFMT
     - elem_num - how many elements of that type were stored
     - raw_data - the bytes read for this tag

    Returns None for element type codes that are not in _BYTEFMT.
    """
    if elem_code not in _BYTEFMT:
        return None

    # because '>1s' unpack differently from '>s'
    count = "" if elem_num == 1 else str(elem_num)
    fmt = f">{count}{_BYTEFMT[elem_code]}"

    assert len(raw_data) == struct.calcsize(fmt)
    values = struct.unpack(fmt, raw_data)

    # date and time always consume the whole tuple
    if elem_code == 10:
        return str(datetime.date(*values))
    if elem_code == 11:
        return str(datetime.time(*values[:3]))

    # no need to use tuple if there is only one value
    if len(values) == 1:
        values = values[0]

    # account for different data types
    if elem_code == 13:
        return bool(values)
    if elem_code == 18:
        # pString: drop the first byte
        return values[1:]
    if elem_code == 19:
        # cString: drop the last byte
        return values[:-1]
    return values
|
|
|
|
|
|
if __name__ == "__main__":
    # Nothing is done when the module is run as a script.
    pass
|