mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
Parser for Affy CEL file version 4
Backwards compatibility with version 3 is maintained. Test cases for package Affy are added. Both new and old code is now tested. affy_v4_example.CEL is added. It is artifically generated and small. See also: http://www.affymetrix.com/estore/support/developer/powertools/changelog/gcos-agcc/cel.html.affx Which explains: Version 3 is generated by the MAS software. This was also known as the ASCII version. Version 4 is generated by the GCOS software. This was also known as the binary or XDA version. Command Console version 1 is generated by the Command Console software. This is stored in the Command Console "generic" data file format. (Squashed commit of pull reqesut #1011)
This commit is contained in:
committed by
Peter Cock
parent
caa9b7a302
commit
6c14a26dda
@ -1,20 +1,28 @@
|
||||
# Copyright 2004 by Harry Zuzan. All rights reserved.
|
||||
# Copyright 2004 by Harry Zuzan. All rights reserved.
|
||||
# Copyright 2016 by Adam Kurkiewicz. All rights reserved.
|
||||
# This code is part of the Biopython distribution and governed by its
|
||||
# license. Please see the LICENSE file that should have been included
|
||||
# as part of this package.
|
||||
|
||||
"""
|
||||
Classes for accessing the information in Affymetrix cel files.
|
||||
|
||||
Functions:
|
||||
read Read a cel file and store its contents in a Record
|
||||
|
||||
Classes:
|
||||
Record Contains the information from a cel file
|
||||
Reading information from Affymetrix CEL files version 3 and 4.
|
||||
"""
|
||||
|
||||
# We use print in the doctests
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
import struct
|
||||
|
||||
try:
|
||||
import numpy
|
||||
except ImportError:
|
||||
from Bio import MissingPythonDependencyError
|
||||
raise MissingPythonDependencyError(
|
||||
"Install NumPy if you want to use Bio.Affy.CelFile")
|
||||
|
||||
# for debugging
|
||||
# import pprint
|
||||
# pp = pprint.PrettyPrinter(indent=4)
|
||||
|
||||
|
||||
try:
|
||||
import numpy
|
||||
@ -30,7 +38,7 @@ class Record(object):
|
||||
Example usage:
|
||||
|
||||
>>> from Bio.Affy import CelFile
|
||||
>>> with open('Affy/affy_v3_example.CEL') as handle:
|
||||
>>> with open("Affy/affy_v3_example.CEL", "r") as handle:
|
||||
... c = CelFile.read(handle)
|
||||
...
|
||||
>>> print(c.ncols, c.nrows)
|
||||
@ -78,8 +86,206 @@ class Record(object):
|
||||
|
||||
|
||||
def read(handle):
|
||||
""" Reads Affymetrix CEL file and returns Record object.
|
||||
|
||||
CEL files version 3 and 4 are supported, and the parser attempts version detection.
|
||||
|
||||
Example Usage:
|
||||
|
||||
>>> from Bio.Affy import CelFile
|
||||
>>> with open("Affy/affy_v4_example.CEL", "rb") as handle:
|
||||
... c = CelFile.read(handle)
|
||||
...
|
||||
>>> c.version == 4
|
||||
True
|
||||
"""
|
||||
Read the information in a cel file, and store it in a Record.
|
||||
# If we fail to read the magic number, then it will remain None, and thus
|
||||
# we will invoke read_v3 (if mode is not strict), or raise IOError if mode is
|
||||
# strict.
|
||||
magicNumber = None
|
||||
# We check if the handle is a file-like object. If it isn't, and the mode
|
||||
# is strict, we raise an error. If it isn't and the mode isn't strict, we
|
||||
# continue (perhaps somebody has got a CEL-file-like iterable, which used
|
||||
# to work with previous versions of biopython and we don't want to maintain
|
||||
# backwards compatibility).
|
||||
try:
|
||||
mode = handle.mode
|
||||
# By definition an Affymetrix v4 CEL file has 64 as the first 4 bytes.
|
||||
# Note that we use little-endian irrespective of the platform, again by
|
||||
# definition.
|
||||
position = handle.tell()
|
||||
magicNumber = struct.unpack("<i", handle.read(4))[0]
|
||||
except (AttributeError, TypeError):
|
||||
pass
|
||||
finally:
|
||||
try:
|
||||
# reset the offset, to avoid breaking either v3 or v4.
|
||||
handle.seek(position)
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
if magicNumber != 64:
|
||||
return read_v3(handle)
|
||||
|
||||
else:
|
||||
# In v4 we're always strict, as we don't have to worry about backwards
|
||||
# compatibility
|
||||
if mode != "rb":
|
||||
raise IOError("You're trying to open an Affymetrix v4 CEL file. "
|
||||
"You have to use a read binary mode, like this "
|
||||
"`open(filename \"rb\")`.")
|
||||
return read_v4(handle)
|
||||
|
||||
|
||||
# read Affymetrix files version 4.
|
||||
def read_v4(f):
|
||||
""" Reads Affymetrix CEL file, version 4, and returns a corresponding Record object.
|
||||
|
||||
Most importantly record.intensities correspond to intensities from the CEL file.
|
||||
|
||||
record.mask and record.outliers are not set.
|
||||
|
||||
Example Usage:
|
||||
|
||||
>>> from Bio.Affy import CelFile
|
||||
>>> with open("Affy/affy_v4_example.CEL", "rb") as handle:
|
||||
... c = CelFile.read_v4(handle)
|
||||
...
|
||||
>>> c.version == 4
|
||||
True
|
||||
>>> print(c.intensities.shape)
|
||||
(5, 5)
|
||||
"""
|
||||
# We follow the documentation here:
|
||||
# http://www.affymetrix.com/estore/support/developer/powertools/changelog/gcos-agcc/cel.html.affx
|
||||
record = Record()
|
||||
preHeaders = ["magic", "version", "columns", "rows", "cellNo", "headerLen"]
|
||||
preHeadersMap = dict()
|
||||
headersMap = dict()
|
||||
|
||||
# load pre-headers
|
||||
for name in preHeaders:
|
||||
preHeadersMap[name] = struct.unpack("<i", f.read(4))[0]
|
||||
char = f.read(preHeadersMap["headerLen"])
|
||||
header = char.decode("ascii", "ignore")
|
||||
for header in header.split("\n"):
|
||||
if "=" in header:
|
||||
header = header.split("=")
|
||||
headersMap[header[0]] = "=".join(header[1:])
|
||||
|
||||
# for debugging
|
||||
# pp.pprint("preHeadersMap")
|
||||
# pp.pprint(preHeadersMap)
|
||||
# pp.pprint("headersMap")
|
||||
# pp.pprint(headersMap)
|
||||
|
||||
record.version = preHeadersMap["version"]
|
||||
if record.version != 4:
|
||||
raise ValueError("You are trying to parse CEL file version 4. This file"
|
||||
" violates the structure expected from CEL file"
|
||||
" version 4")
|
||||
record.GridCornerUL = headersMap["GridCornerUL"]
|
||||
record.GridCornerUR = headersMap["GridCornerUR"]
|
||||
record.GridCornerLR = headersMap["GridCornerLR"]
|
||||
record.GridCornerLL = headersMap["GridCornerLL"]
|
||||
record.DatHeader = headersMap["DatHeader"]
|
||||
record.Algorithm = headersMap["Algorithm"]
|
||||
record.AlgorithmParameters = headersMap["AlgorithmParameters"]
|
||||
record.NumberCells = preHeadersMap["cellNo"]
|
||||
# record.intensities are set below
|
||||
# record.stdevs are set below
|
||||
# record.npix are set below
|
||||
record.nrows = int(headersMap["Rows"])
|
||||
record.ncols = int(headersMap["Cols"])
|
||||
|
||||
# These cannot be reliably set in v4, because of discrepancies between real
|
||||
# data and the documented format.
|
||||
record.nmask = None
|
||||
record.mask = None
|
||||
record.noutliers = None
|
||||
record.outliers = None
|
||||
record.modified = None
|
||||
|
||||
# Real data never seems to have anything but zeros here, but we don't want
|
||||
# to take chances. Raising an error is better than returning unreliable
|
||||
# data.
|
||||
def raiseBadHeader(field, expected):
|
||||
actual = int(headersMap[field])
|
||||
message = "The header {field} is expected to be 0, not {value}".format(value=actual, field=field)
|
||||
if actual != expected:
|
||||
raise ValueError(message)
|
||||
|
||||
raiseBadHeader("Axis-invertX", 0)
|
||||
|
||||
raiseBadHeader("AxisInvertY", 0)
|
||||
|
||||
raiseBadHeader("OffsetX", 0)
|
||||
|
||||
raiseBadHeader("OffsetY", 0)
|
||||
|
||||
# This is unfortunately undocumented, but it turns out that real data has
|
||||
# the `record.AlgorithmParameters` repeated in the data section, until an
|
||||
# EOF, i.e. b"\x04".
|
||||
char = b"\x00"
|
||||
safetyValve = 10**4
|
||||
for i in range(safetyValve):
|
||||
char = f.read(1)
|
||||
# For debugging
|
||||
# print([i for i in char], end="")
|
||||
if char == b"\x04":
|
||||
break
|
||||
if i == safetyValve:
|
||||
raise ValueError("Parse Error. The parser expects a short, "
|
||||
"undocumented binary blob terminating with "
|
||||
"ASCII EOF, x04")
|
||||
|
||||
# After that there are precisely 15 bytes padded. Again, undocumented.
|
||||
padding = f.read(15)
|
||||
|
||||
# That's how we pull out the values (triplets of the form float, float,
|
||||
# signed short).
|
||||
structa = struct.Struct("< f f h")
|
||||
|
||||
# There are 10 bytes in our struct.
|
||||
structSize = 10
|
||||
|
||||
# We initialise the most important: intensities, stdevs and npixs.
|
||||
record.intensities = numpy.empty(record.NumberCells, dtype=float)
|
||||
record.stdevs = numpy.empty(record.NumberCells, dtype=float)
|
||||
record.npix = numpy.empty(record.NumberCells, dtype=int)
|
||||
|
||||
b = f.read(structSize * record.NumberCells)
|
||||
for i in range(record.NumberCells):
|
||||
binaryFragment = b[i * structSize: (i + 1) * structSize]
|
||||
intensity, stdevs, npix = structa.unpack(binaryFragment)
|
||||
record.intensities[i] = intensity
|
||||
record.stdevs[i] = stdevs
|
||||
record.npix[i] = npix
|
||||
|
||||
# reshape without copying.
|
||||
def reshape(array):
|
||||
view = array.view()
|
||||
view.shape = (record.nrows, record.ncols)
|
||||
return view
|
||||
|
||||
record.intensities = reshape(record.intensities)
|
||||
record.stdevs = reshape(record.stdevs)
|
||||
record.npix = reshape(record.npix)
|
||||
|
||||
return record
|
||||
|
||||
|
||||
def read_v3(handle):
|
||||
""" Reads Affymetrix CEL file, version 3, and returns a corresponding Record object.
|
||||
|
||||
Example Usage:
|
||||
|
||||
>>> from Bio.Affy import CelFile
|
||||
>>> with open("Affy/affy_v3_example.CEL", "r") as handle:
|
||||
... c = CelFile.read_v3(handle)
|
||||
...
|
||||
>>> c.version == 3
|
||||
True
|
||||
"""
|
||||
# Needs error handling.
|
||||
# Needs to know the chip design.
|
||||
@ -112,7 +318,7 @@ def read(handle):
|
||||
section = ""
|
||||
elif section == "CEL":
|
||||
keyword, value = line.split("=", 1)
|
||||
if keyword == 'Version':
|
||||
if keyword == "Version":
|
||||
record.version = int(value)
|
||||
elif section == "HEADER":
|
||||
# Set record.ncols and record.nrows, remaining data goes into
|
||||
@ -122,24 +328,24 @@ def read(handle):
|
||||
record.ncols = int(value)
|
||||
elif keyword == "Rows":
|
||||
record.nrows = int(value)
|
||||
elif keyword == 'GridCornerUL':
|
||||
elif keyword == "GridCornerUL":
|
||||
x, y = value.split()
|
||||
record.GridCornerUL = (int(x), int(y))
|
||||
elif keyword == 'GridCornerUR':
|
||||
elif keyword == "GridCornerUR":
|
||||
x, y = value.split()
|
||||
record.GridCornerUR = (int(x), int(y))
|
||||
elif keyword == 'GridCornerLR':
|
||||
elif keyword == "GridCornerLR":
|
||||
x, y = value.split()
|
||||
record.GridCornerLR = (int(x), int(y))
|
||||
elif keyword == 'GridCornerLL':
|
||||
elif keyword == "GridCornerLL":
|
||||
x, y = value.split()
|
||||
record.GridCornerLL = (int(x), int(y))
|
||||
elif keyword == 'DatHeader':
|
||||
record.DatHeader = value.strip('\n\r')
|
||||
elif keyword == 'Algorithm':
|
||||
record.Algorithm = value.strip('\n\r')
|
||||
elif keyword == 'AlgorithmParameters':
|
||||
record.AlgorithmParameters = value.strip('\n\r')
|
||||
elif keyword == "DatHeader":
|
||||
record.DatHeader = value.strip("\n\r")
|
||||
elif keyword == "Algorithm":
|
||||
record.Algorithm = value.strip("\n\r")
|
||||
elif keyword == "AlgorithmParameters":
|
||||
record.AlgorithmParameters = value.strip("\n\r")
|
||||
elif section == "INTENSITY":
|
||||
if "NumberCells" in line:
|
||||
record.NumberCells = int(line.split("=", 1)[1])
|
||||
|
1
CONTRIB
1
CONTRIB
@ -79,6 +79,7 @@ Frank Kauff <fkauff at domain duke.edu>
|
||||
Siong Kong
|
||||
David Koppstein <https://github.com/dkoppstein>
|
||||
Andreas Kuntzagk <andreas.kuntzagk at domain mdc-berlin.de>
|
||||
Adam Kurkiewicz <adam@kurkiewicz.pl>
|
||||
Michal Kurowski <michal at domain genesilico.pl>
|
||||
Gleb Kuznetsov <https://github.com/glebkuznetsov>
|
||||
Uri Laserson <https://github.com/laserson>
|
||||
|
9
NEWS
9
NEWS
@ -40,6 +40,9 @@ patent data files, and the related IMGT parser was updated to cope with
|
||||
IPD-IMGT/HLA database files after release v3.16.0 when their ID line changed.
|
||||
The GenBank output now uses colon space to match current NCBI DBLINK lines.
|
||||
|
||||
The Bio.Affy package supports Affymetrix version 4 of the CEL file format,
|
||||
in addition to version 3.
|
||||
|
||||
Additionally, a number of small bugs have been fixed with further additions
|
||||
to the test suite, and there has been further work to follow the Python PEP8,
|
||||
PEP257 and best practice standard coding style.
|
||||
@ -47,9 +50,11 @@ PEP257 and best practice standard coding style.
|
||||
Many thanks to the Biopython developers and community for making this release
|
||||
possible, especially the following contributors:
|
||||
|
||||
|
||||
Aaron Rosenfeld
|
||||
Adam Kurkiewicz (first contribution)
|
||||
Adrian Altenhoff (first contribution)
|
||||
Allis Tauri (first contribution)
|
||||
Aaron Rosenfeld
|
||||
Bernhard Thiel (first contribution)
|
||||
Bertrand Néron
|
||||
Brandon Invergo
|
||||
@ -57,8 +62,8 @@ Carlos Pena
|
||||
Carlos Ríos
|
||||
Chris Warth
|
||||
Emmanuel Noutahi
|
||||
Markus Piotrowski
|
||||
Lenna Peterson
|
||||
Markus Piotrowski
|
||||
Peter Cock
|
||||
Richard Neher (first contribution)
|
||||
Sourav Singh (first contribution)
|
||||
|
BIN
Tests/Affy/affy_v4_example.CEL
Normal file
BIN
Tests/Affy/affy_v4_example.CEL
Normal file
Binary file not shown.
153
Tests/test_Affy.py
Normal file
153
Tests/test_Affy.py
Normal file
@ -0,0 +1,153 @@
|
||||
import unittest
|
||||
|
||||
from Bio.Affy import CelFile
|
||||
import struct
|
||||
import os
|
||||
|
||||
try:
|
||||
from numpy import array
|
||||
import numpy.testing
|
||||
except ImportError:
|
||||
from Bio import MissingPythonDependencyError
|
||||
raise MissingPythonDependencyError(
|
||||
"Install NumPy if you want to use Bio.Affy.CelFile")
|
||||
|
||||
|
||||
class AffyTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.affy3 = "Affy/affy_v3_example.CEL"
|
||||
self.affy4 = "Affy/affy_v4_example.CEL"
|
||||
self.affy4Bad = "Affy/affy_v4_bad_example.CEL"
|
||||
with open(self.affy4Bad, "wb") as f:
|
||||
self.writeExampleV4(f, bad=True)
|
||||
|
||||
def tearDown(self):
|
||||
os.remove(self.affy4Bad)
|
||||
|
||||
# tests if the new strict mode complains about passing a non-file
|
||||
def testAffyStrict(self):
|
||||
with self.assertRaises(IOError):
|
||||
record = CelFile.read("hello", strict=True)
|
||||
|
||||
# tests if the new strict mode is backwards compatible
|
||||
def testAffyStrict(self):
|
||||
record = CelFile.read("hello")
|
||||
assert record.DatHeader is None
|
||||
|
||||
# tests the old Affymetrix v3 parser
|
||||
def testAffy3(self):
|
||||
with open(self.affy3, "r") as f:
|
||||
record = CelFile.read(f)
|
||||
assert(len(record.DatHeader) > 0)
|
||||
assert(record.intensities.shape == (5, 5))
|
||||
assert(record.intensities.shape == record.stdevs.shape)
|
||||
assert(record.intensities.shape == record.npix.shape)
|
||||
assert(record.ncols == 5)
|
||||
assert(record.nrows == 5)
|
||||
|
||||
# tests the new Affymetrix v4 parser
|
||||
def testAffy4(self):
|
||||
with open(self.affy4, "rb") as f:
|
||||
record = CelFile.read(f)
|
||||
assert(record.intensities.shape == (5, 5))
|
||||
assert(record.intensities.shape == record.stdevs.shape)
|
||||
assert(record.intensities.shape == record.npix.shape)
|
||||
assert(record.ncols == 5)
|
||||
assert(record.nrows == 5)
|
||||
numpy.testing.assert_allclose(record.intensities,
|
||||
[[0., 1., 2., 3., 4.],
|
||||
[5., 6., 7., 8., 9.],
|
||||
[10., 11., 12., 13., 14.],
|
||||
[15., 16., 17., 18., 19.],
|
||||
[20., 21., 22., 23., 24.]])
|
||||
numpy.testing.assert_allclose(record.stdevs,
|
||||
[[0., -1., -2., -3., -4.],
|
||||
[-5., -6., -7., -8., -9.],
|
||||
[-10., -11., -12., -13., -14.],
|
||||
[-15., -16., -17., -18., -19.],
|
||||
[-20., -21., -22., -23., -24.]])
|
||||
numpy.testing.assert_allclose(record.npix,
|
||||
[[9, 9, 9, 9, 9],
|
||||
[9, 9, 9, 9, 9],
|
||||
[9, 9, 9, 9, 9],
|
||||
[9, 9, 9, 9, 9],
|
||||
[9, 9, 9, 9, 9]])
|
||||
assert(len(record.AlgorithmParameters) == 329)
|
||||
assert(len(record.GridCornerUL) == 7)
|
||||
assert(record.AlgorithmParameters[-3:] == '169')
|
||||
|
||||
def testAffyBadHeader(self):
|
||||
with self.assertRaises(ValueError):
|
||||
with open(self.affy4Bad, "rb") as f:
|
||||
record = CelFile.read(f)
|
||||
|
||||
# Writes a small example Affymetrix V4 CEL File
|
||||
def writeExampleV4(self, f, bad=False):
|
||||
preHeaders = {'cellNo': 25,
|
||||
'columns': 5,
|
||||
'headerLen': 752,
|
||||
'magic': 64,
|
||||
'rows': 5,
|
||||
'version': 4}
|
||||
goodH = {u'Axis-invertX': b'0'}
|
||||
badH = {u'Axis-invertX': b'1'}
|
||||
|
||||
headers = {u'Algorithm': b'Percentile',
|
||||
u'AlgorithmParameters': b'Percentile:75;CellMargin:4;Outlie'
|
||||
b'rHigh:1.500;OutlierLow:1.004;AlgVersion:6.0;FixedCellSize'
|
||||
b':TRUE;FullFeatureWidth:7;FullFeatureHeight:7;IgnoreOutlie'
|
||||
b'rsInShiftRows:FALSE;FeatureExtraction:TRUE;PoolWidthExten'
|
||||
b'stion:1;PoolHeightExtension:1;UseSubgrids:FALSE;Randomize'
|
||||
b'Pixels:FALSE;ErrorBasis:StdvMean;StdMult:1.000000;NumDATS'
|
||||
b'ubgrids:169',
|
||||
u'AxisInvertY': b'0',
|
||||
u'Cols': b'5',
|
||||
u'DatHeader': b'[0..65534] 20_10N:CLS=19420RWS=19420XIN=0'
|
||||
b' YIN=0 VE=30 2.0 05/25/05 23:19:07 50102310 M10 '
|
||||
b' \x14 \x14 HuEx-1_0-st-v2.1sq \x14 \x14 \x14 \x14 '
|
||||
b'\x14570 \x14 25540.671875 \x14 3.500000 \x14 0.7000 \x14'
|
||||
b' 3',
|
||||
u'GridCornerLL': b'518 18668',
|
||||
u'GridCornerLR': b'18800 18825',
|
||||
u'GridCornerUL': b'659 469',
|
||||
u'GridCornerUR': b'18942 623',
|
||||
u'OffsetX': b'0',
|
||||
u'OffsetY': b'0',
|
||||
u'Rows': b'5',
|
||||
u'TotalX': b'2560',
|
||||
u'TotalY': b'2560',
|
||||
u'swapXY': b'0'}
|
||||
if not bad:
|
||||
headers.update(goodH)
|
||||
else:
|
||||
headers.update(badH)
|
||||
prePadding = b"this text doesn't matter and is ignored\x04"
|
||||
preHeadersOrder = ["magic",
|
||||
"version",
|
||||
"columns",
|
||||
"rows",
|
||||
"cellNo",
|
||||
"headerLen"]
|
||||
headersEncoded = struct.pack('<' + 'i' * len(preHeadersOrder),
|
||||
*[preHeaders[header] for header in
|
||||
preHeadersOrder])
|
||||
|
||||
def packData(intensity, sdev, pixel):
|
||||
return struct.pack("< f f h", intensity, sdev, pixel)
|
||||
f.write(headersEncoded)
|
||||
for header in headers:
|
||||
try:
|
||||
f.write(bytes(header, encoding="utf-8") +
|
||||
b"=" +
|
||||
headers[header] +
|
||||
b"\n")
|
||||
except TypeError:
|
||||
f.write(header + b"=" + headers[header] + b"\n")
|
||||
f.write(prePadding)
|
||||
f.write(b"\x00" * 15)
|
||||
for i in range(25):
|
||||
f.write(packData(float(i), float(-i), 9))
|
||||
|
||||
if __name__ == "__main__":
|
||||
runner = unittest.TextTestRunner(verbosity=0)
|
||||
unittest.main(testRunner=runner)
|
Reference in New Issue
Block a user