Parser for Affy CEL file version 4

Backwards compatibility with version 3 is maintained.

Test cases for package Affy are added. Both new and old code is now tested.

affy_v4_example.CEL is added. It is artifically generated and small.

See also:

http://www.affymetrix.com/estore/support/developer/powertools/changelog/gcos-agcc/cel.html.affx

Which explains:

    Version 3 is generated by the MAS software. This was also
    known as the ASCII version.

    Version 4 is generated by the GCOS software. This was also
    known as the binary or XDA version.

    Command Console version 1 is generated by the Command
    Console software. This is stored in the Command Console
    "generic" data file format.

(Squashed commit of pull reqesut #1011)
This commit is contained in:
Adam Kurkiewicz
2016-11-27 19:06:22 +00:00
committed by Peter Cock
parent caa9b7a302
commit 6c14a26dda
5 changed files with 389 additions and 24 deletions

View File

@ -1,20 +1,28 @@
# Copyright 2004 by Harry Zuzan. All rights reserved.
# Copyright 2004 by Harry Zuzan. All rights reserved.
# Copyright 2016 by Adam Kurkiewicz. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""
Classes for accessing the information in Affymetrix cel files.
Functions:
read Read a cel file and store its contents in a Record
Classes:
Record Contains the information from a cel file
Reading information from Affymetrix CEL files version 3 and 4.
"""
# We use print in the doctests
from __future__ import print_function
import sys
import struct
try:
import numpy
except ImportError:
from Bio import MissingPythonDependencyError
raise MissingPythonDependencyError(
"Install NumPy if you want to use Bio.Affy.CelFile")
# for debugging
# import pprint
# pp = pprint.PrettyPrinter(indent=4)
try:
import numpy
@ -30,7 +38,7 @@ class Record(object):
Example usage:
>>> from Bio.Affy import CelFile
>>> with open('Affy/affy_v3_example.CEL') as handle:
>>> with open("Affy/affy_v3_example.CEL", "r") as handle:
... c = CelFile.read(handle)
...
>>> print(c.ncols, c.nrows)
@ -78,8 +86,206 @@ class Record(object):
def read(handle):
""" Reads Affymetrix CEL file and returns Record object.
CEL files version 3 and 4 are supported, and the parser attempts version detection.
Example Usage:
>>> from Bio.Affy import CelFile
>>> with open("Affy/affy_v4_example.CEL", "rb") as handle:
... c = CelFile.read(handle)
...
>>> c.version == 4
True
"""
Read the information in a cel file, and store it in a Record.
# If we fail to read the magic number, then it will remain None, and thus
# we will invoke read_v3 (if mode is not strict), or raise IOError if mode is
# strict.
magicNumber = None
# We check if the handle is a file-like object. If it isn't, and the mode
# is strict, we raise an error. If it isn't and the mode isn't strict, we
# continue (perhaps somebody has got a CEL-file-like iterable, which used
# to work with previous versions of biopython and we don't want to maintain
# backwards compatibility).
try:
mode = handle.mode
# By definition an Affymetrix v4 CEL file has 64 as the first 4 bytes.
# Note that we use little-endian irrespective of the platform, again by
# definition.
position = handle.tell()
magicNumber = struct.unpack("<i", handle.read(4))[0]
except (AttributeError, TypeError):
pass
finally:
try:
# reset the offset, to avoid breaking either v3 or v4.
handle.seek(position)
except AttributeError:
pass
if magicNumber != 64:
return read_v3(handle)
else:
# In v4 we're always strict, as we don't have to worry about backwards
# compatibility
if mode != "rb":
raise IOError("You're trying to open an Affymetrix v4 CEL file. "
"You have to use a read binary mode, like this "
"`open(filename \"rb\")`.")
return read_v4(handle)
# read Affymetrix files version 4.
def read_v4(f):
""" Reads Affymetrix CEL file, version 4, and returns a corresponding Record object.
Most importantly record.intensities correspond to intensities from the CEL file.
record.mask and record.outliers are not set.
Example Usage:
>>> from Bio.Affy import CelFile
>>> with open("Affy/affy_v4_example.CEL", "rb") as handle:
... c = CelFile.read_v4(handle)
...
>>> c.version == 4
True
>>> print(c.intensities.shape)
(5, 5)
"""
# We follow the documentation here:
# http://www.affymetrix.com/estore/support/developer/powertools/changelog/gcos-agcc/cel.html.affx
record = Record()
preHeaders = ["magic", "version", "columns", "rows", "cellNo", "headerLen"]
preHeadersMap = dict()
headersMap = dict()
# load pre-headers
for name in preHeaders:
preHeadersMap[name] = struct.unpack("<i", f.read(4))[0]
char = f.read(preHeadersMap["headerLen"])
header = char.decode("ascii", "ignore")
for header in header.split("\n"):
if "=" in header:
header = header.split("=")
headersMap[header[0]] = "=".join(header[1:])
# for debugging
# pp.pprint("preHeadersMap")
# pp.pprint(preHeadersMap)
# pp.pprint("headersMap")
# pp.pprint(headersMap)
record.version = preHeadersMap["version"]
if record.version != 4:
raise ValueError("You are trying to parse CEL file version 4. This file"
" violates the structure expected from CEL file"
" version 4")
record.GridCornerUL = headersMap["GridCornerUL"]
record.GridCornerUR = headersMap["GridCornerUR"]
record.GridCornerLR = headersMap["GridCornerLR"]
record.GridCornerLL = headersMap["GridCornerLL"]
record.DatHeader = headersMap["DatHeader"]
record.Algorithm = headersMap["Algorithm"]
record.AlgorithmParameters = headersMap["AlgorithmParameters"]
record.NumberCells = preHeadersMap["cellNo"]
# record.intensities are set below
# record.stdevs are set below
# record.npix are set below
record.nrows = int(headersMap["Rows"])
record.ncols = int(headersMap["Cols"])
# These cannot be reliably set in v4, because of discrepancies between real
# data and the documented format.
record.nmask = None
record.mask = None
record.noutliers = None
record.outliers = None
record.modified = None
# Real data never seems to have anything but zeros here, but we don't want
# to take chances. Raising an error is better than returning unreliable
# data.
def raiseBadHeader(field, expected):
actual = int(headersMap[field])
message = "The header {field} is expected to be 0, not {value}".format(value=actual, field=field)
if actual != expected:
raise ValueError(message)
raiseBadHeader("Axis-invertX", 0)
raiseBadHeader("AxisInvertY", 0)
raiseBadHeader("OffsetX", 0)
raiseBadHeader("OffsetY", 0)
# This is unfortunately undocumented, but it turns out that real data has
# the `record.AlgorithmParameters` repeated in the data section, until an
# EOF, i.e. b"\x04".
char = b"\x00"
safetyValve = 10**4
for i in range(safetyValve):
char = f.read(1)
# For debugging
# print([i for i in char], end="")
if char == b"\x04":
break
if i == safetyValve:
raise ValueError("Parse Error. The parser expects a short, "
"undocumented binary blob terminating with "
"ASCII EOF, x04")
# After that there are precisely 15 bytes padded. Again, undocumented.
padding = f.read(15)
# That's how we pull out the values (triplets of the form float, float,
# signed short).
structa = struct.Struct("< f f h")
# There are 10 bytes in our struct.
structSize = 10
# We initialise the most important: intensities, stdevs and npixs.
record.intensities = numpy.empty(record.NumberCells, dtype=float)
record.stdevs = numpy.empty(record.NumberCells, dtype=float)
record.npix = numpy.empty(record.NumberCells, dtype=int)
b = f.read(structSize * record.NumberCells)
for i in range(record.NumberCells):
binaryFragment = b[i * structSize: (i + 1) * structSize]
intensity, stdevs, npix = structa.unpack(binaryFragment)
record.intensities[i] = intensity
record.stdevs[i] = stdevs
record.npix[i] = npix
# reshape without copying.
def reshape(array):
view = array.view()
view.shape = (record.nrows, record.ncols)
return view
record.intensities = reshape(record.intensities)
record.stdevs = reshape(record.stdevs)
record.npix = reshape(record.npix)
return record
def read_v3(handle):
""" Reads Affymetrix CEL file, version 3, and returns a corresponding Record object.
Example Usage:
>>> from Bio.Affy import CelFile
>>> with open("Affy/affy_v3_example.CEL", "r") as handle:
... c = CelFile.read_v3(handle)
...
>>> c.version == 3
True
"""
# Needs error handling.
# Needs to know the chip design.
@ -112,7 +318,7 @@ def read(handle):
section = ""
elif section == "CEL":
keyword, value = line.split("=", 1)
if keyword == 'Version':
if keyword == "Version":
record.version = int(value)
elif section == "HEADER":
# Set record.ncols and record.nrows, remaining data goes into
@ -122,24 +328,24 @@ def read(handle):
record.ncols = int(value)
elif keyword == "Rows":
record.nrows = int(value)
elif keyword == 'GridCornerUL':
elif keyword == "GridCornerUL":
x, y = value.split()
record.GridCornerUL = (int(x), int(y))
elif keyword == 'GridCornerUR':
elif keyword == "GridCornerUR":
x, y = value.split()
record.GridCornerUR = (int(x), int(y))
elif keyword == 'GridCornerLR':
elif keyword == "GridCornerLR":
x, y = value.split()
record.GridCornerLR = (int(x), int(y))
elif keyword == 'GridCornerLL':
elif keyword == "GridCornerLL":
x, y = value.split()
record.GridCornerLL = (int(x), int(y))
elif keyword == 'DatHeader':
record.DatHeader = value.strip('\n\r')
elif keyword == 'Algorithm':
record.Algorithm = value.strip('\n\r')
elif keyword == 'AlgorithmParameters':
record.AlgorithmParameters = value.strip('\n\r')
elif keyword == "DatHeader":
record.DatHeader = value.strip("\n\r")
elif keyword == "Algorithm":
record.Algorithm = value.strip("\n\r")
elif keyword == "AlgorithmParameters":
record.AlgorithmParameters = value.strip("\n\r")
elif section == "INTENSITY":
if "NumberCells" in line:
record.NumberCells = int(line.split("=", 1)[1])

View File

@ -79,6 +79,7 @@ Frank Kauff <fkauff at domain duke.edu>
Siong Kong
David Koppstein <https://github.com/dkoppstein>
Andreas Kuntzagk <andreas.kuntzagk at domain mdc-berlin.de>
Adam Kurkiewicz <adam@kurkiewicz.pl>
Michal Kurowski <michal at domain genesilico.pl>
Gleb Kuznetsov <https://github.com/glebkuznetsov>
Uri Laserson <https://github.com/laserson>

9
NEWS
View File

@ -40,6 +40,9 @@ patent data files, and the related IMGT parser was updated to cope with
IPD-IMGT/HLA database files after release v3.16.0 when their ID line changed.
The GenBank output now uses colon space to match current NCBI DBLINK lines.
The Bio.Affy package supports Affymetrix version 4 of the CEL file format,
in addition to version 3.
Additionally, a number of small bugs have been fixed with further additions
to the test suite, and there has been further work to follow the Python PEP8,
PEP257 and best practice standard coding style.
@ -47,9 +50,11 @@ PEP257 and best practice standard coding style.
Many thanks to the Biopython developers and community for making this release
possible, especially the following contributors:
Aaron Rosenfeld
Adam Kurkiewicz (first contribution)
Adrian Altenhoff (first contribution)
Allis Tauri (first contribution)
Aaron Rosenfeld
Bernhard Thiel (first contribution)
Bertrand Néron
Brandon Invergo
@ -57,8 +62,8 @@ Carlos Pena
Carlos Ríos
Chris Warth
Emmanuel Noutahi
Markus Piotrowski
Lenna Peterson
Markus Piotrowski
Peter Cock
Richard Neher (first contribution)
Sourav Singh (first contribution)

Binary file not shown.

153
Tests/test_Affy.py Normal file
View File

@ -0,0 +1,153 @@
import unittest
from Bio.Affy import CelFile
import struct
import os
try:
from numpy import array
import numpy.testing
except ImportError:
from Bio import MissingPythonDependencyError
raise MissingPythonDependencyError(
"Install NumPy if you want to use Bio.Affy.CelFile")
class AffyTest(unittest.TestCase):
def setUp(self):
self.affy3 = "Affy/affy_v3_example.CEL"
self.affy4 = "Affy/affy_v4_example.CEL"
self.affy4Bad = "Affy/affy_v4_bad_example.CEL"
with open(self.affy4Bad, "wb") as f:
self.writeExampleV4(f, bad=True)
def tearDown(self):
os.remove(self.affy4Bad)
# tests if the new strict mode complains about passing a non-file
def testAffyStrict(self):
with self.assertRaises(IOError):
record = CelFile.read("hello", strict=True)
# tests if the new strict mode is backwards compatible
def testAffyStrict(self):
record = CelFile.read("hello")
assert record.DatHeader is None
# tests the old Affymetrix v3 parser
def testAffy3(self):
with open(self.affy3, "r") as f:
record = CelFile.read(f)
assert(len(record.DatHeader) > 0)
assert(record.intensities.shape == (5, 5))
assert(record.intensities.shape == record.stdevs.shape)
assert(record.intensities.shape == record.npix.shape)
assert(record.ncols == 5)
assert(record.nrows == 5)
# tests the new Affymetrix v4 parser
def testAffy4(self):
with open(self.affy4, "rb") as f:
record = CelFile.read(f)
assert(record.intensities.shape == (5, 5))
assert(record.intensities.shape == record.stdevs.shape)
assert(record.intensities.shape == record.npix.shape)
assert(record.ncols == 5)
assert(record.nrows == 5)
numpy.testing.assert_allclose(record.intensities,
[[0., 1., 2., 3., 4.],
[5., 6., 7., 8., 9.],
[10., 11., 12., 13., 14.],
[15., 16., 17., 18., 19.],
[20., 21., 22., 23., 24.]])
numpy.testing.assert_allclose(record.stdevs,
[[0., -1., -2., -3., -4.],
[-5., -6., -7., -8., -9.],
[-10., -11., -12., -13., -14.],
[-15., -16., -17., -18., -19.],
[-20., -21., -22., -23., -24.]])
numpy.testing.assert_allclose(record.npix,
[[9, 9, 9, 9, 9],
[9, 9, 9, 9, 9],
[9, 9, 9, 9, 9],
[9, 9, 9, 9, 9],
[9, 9, 9, 9, 9]])
assert(len(record.AlgorithmParameters) == 329)
assert(len(record.GridCornerUL) == 7)
assert(record.AlgorithmParameters[-3:] == '169')
def testAffyBadHeader(self):
with self.assertRaises(ValueError):
with open(self.affy4Bad, "rb") as f:
record = CelFile.read(f)
# Writes a small example Affymetrix V4 CEL File
def writeExampleV4(self, f, bad=False):
preHeaders = {'cellNo': 25,
'columns': 5,
'headerLen': 752,
'magic': 64,
'rows': 5,
'version': 4}
goodH = {u'Axis-invertX': b'0'}
badH = {u'Axis-invertX': b'1'}
headers = {u'Algorithm': b'Percentile',
u'AlgorithmParameters': b'Percentile:75;CellMargin:4;Outlie'
b'rHigh:1.500;OutlierLow:1.004;AlgVersion:6.0;FixedCellSize'
b':TRUE;FullFeatureWidth:7;FullFeatureHeight:7;IgnoreOutlie'
b'rsInShiftRows:FALSE;FeatureExtraction:TRUE;PoolWidthExten'
b'stion:1;PoolHeightExtension:1;UseSubgrids:FALSE;Randomize'
b'Pixels:FALSE;ErrorBasis:StdvMean;StdMult:1.000000;NumDATS'
b'ubgrids:169',
u'AxisInvertY': b'0',
u'Cols': b'5',
u'DatHeader': b'[0..65534] 20_10N:CLS=19420RWS=19420XIN=0'
b' YIN=0 VE=30 2.0 05/25/05 23:19:07 50102310 M10 '
b' \x14 \x14 HuEx-1_0-st-v2.1sq \x14 \x14 \x14 \x14 '
b'\x14570 \x14 25540.671875 \x14 3.500000 \x14 0.7000 \x14'
b' 3',
u'GridCornerLL': b'518 18668',
u'GridCornerLR': b'18800 18825',
u'GridCornerUL': b'659 469',
u'GridCornerUR': b'18942 623',
u'OffsetX': b'0',
u'OffsetY': b'0',
u'Rows': b'5',
u'TotalX': b'2560',
u'TotalY': b'2560',
u'swapXY': b'0'}
if not bad:
headers.update(goodH)
else:
headers.update(badH)
prePadding = b"this text doesn't matter and is ignored\x04"
preHeadersOrder = ["magic",
"version",
"columns",
"rows",
"cellNo",
"headerLen"]
headersEncoded = struct.pack('<' + 'i' * len(preHeadersOrder),
*[preHeaders[header] for header in
preHeadersOrder])
def packData(intensity, sdev, pixel):
return struct.pack("< f f h", intensity, sdev, pixel)
f.write(headersEncoded)
for header in headers:
try:
f.write(bytes(header, encoding="utf-8") +
b"=" +
headers[header] +
b"\n")
except TypeError:
f.write(header + b"=" + headers[header] + b"\n")
f.write(prePadding)
f.write(b"\x00" * 15)
for i in range(25):
f.write(packData(float(i), float(-i), 9))
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=0)
unittest.main(testRunner=runner)