Parser for Affy CEL file version 4

Backwards compatibility with version 3 is maintained. Test cases for package Affy are added. Both new and old code is now tested. affy_v4_example.CEL is added. It is artifically generated and small. See also: http://www.affymetrix.com/estore/support/developer/powertools/changelog/gcos-agcc/cel.html.affx Which explains: Version 3 is generated by the MAS software. This was also known as the ASCII version. Version 4 is generated by the GCOS software. This was also known as the binary or XDA version. Command Console version 1 is generated by the Command Console software. This is stored in the Command Console "generic" data file format. (Squashed commit of pull reqesut #1011)
2025-10-20 13:43:47 +08:00 · 2016-11-27 19:06:22 +00:00
parent caa9b7a302
commit 6c14a26dda
5 changed files with 389 additions and 24 deletions
--- a/Bio/Affy/CelFile.py
+++ b/Bio/Affy/CelFile.py
@ -1,20 +1,28 @@
-# Copyright 2004 by Harry Zuzan.  All rights reserved.
+# Copyright 2004 by Harry Zuzan. All rights reserved.
+# Copyright 2016 by Adam Kurkiewicz. All rights reserved.
 # This code is part of the Biopython distribution and governed by its
 # license.  Please see the LICENSE file that should have been included
 # as part of this package.

 """
-Classes for accessing the information in Affymetrix cel files.
-
-Functions:
-read      Read a cel file and store its contents in a Record
-
-Classes:
-Record    Contains the information from a cel file
+Reading information from Affymetrix CEL files version 3 and 4.
 """

-# We use print in the doctests
 from __future__ import print_function
+import sys
+import struct
+
+try:
+    import numpy
+except ImportError:
+    from Bio import MissingPythonDependencyError
+    raise MissingPythonDependencyError(
+        "Install NumPy if you want to use Bio.Affy.CelFile")
+
+# for debugging
+# import pprint
+# pp = pprint.PrettyPrinter(indent=4)
+

 try:
    import numpy
@ -30,7 +38,7 @@ class Record(object):
    Example usage:

    >>> from Bio.Affy import CelFile
-    >>> with open('Affy/affy_v3_example.CEL') as handle:
+    >>> with open("Affy/affy_v3_example.CEL", "r") as handle:
    ...     c = CelFile.read(handle)
    ...
    >>> print(c.ncols, c.nrows)
@ -78,8 +86,206 @@ class Record(object):


 def read(handle):
+    """ Reads Affymetrix CEL file and returns Record object.
+
+    CEL files version 3 and 4 are supported, and the parser attempts version detection.
+
+    Example Usage:
+
+    >>> from Bio.Affy import CelFile
+    >>> with open("Affy/affy_v4_example.CEL", "rb") as handle:
+    ...     c = CelFile.read(handle)
+    ...
+    >>> c.version == 4
+    True
    """
-    Read the information in a cel file, and store it in a Record.
+    # If we fail to read the magic number, then it will remain None, and thus
+    # we will invoke read_v3 (if mode is not strict), or raise IOError if mode is
+    # strict.
+    magicNumber = None
+    # We check if the handle is a file-like object. If it isn't, and the mode
+    # is strict, we raise an error. If it isn't and the mode isn't strict, we
+    # continue (perhaps somebody has got a CEL-file-like iterable, which used
+    # to work with previous versions of biopython and we don't want to maintain
+    # backwards compatibility).
+    try:
+        mode = handle.mode
+        # By definition an Affymetrix v4 CEL file has 64 as the first 4 bytes.
+        # Note that we use little-endian irrespective of the platform, again by
+        # definition.
+        position = handle.tell()
+        magicNumber = struct.unpack("<i", handle.read(4))[0]
+    except (AttributeError, TypeError):
+        pass
+    finally:
+        try:
+            # reset the offset, to avoid breaking either v3 or v4.
+            handle.seek(position)
+        except AttributeError:
+            pass
+
+    if magicNumber != 64:
+        return read_v3(handle)
+
+    else:
+        # In v4 we're always strict, as we don't have to worry about backwards
+        # compatibility
+        if mode != "rb":
+            raise IOError("You're trying to open an Affymetrix v4 CEL file. "
+                          "You have to use a read binary mode, like this "
+                          "`open(filename \"rb\")`.")
+        return read_v4(handle)
+
+
+# read Affymetrix files version 4.
+def read_v4(f):
+    """ Reads Affymetrix CEL file, version 4, and returns a corresponding Record object.
+
+    Most importantly record.intensities correspond to intensities from the CEL file.
+
+    record.mask and record.outliers are not set.
+
+    Example Usage:
+
+    >>> from Bio.Affy import CelFile
+    >>> with open("Affy/affy_v4_example.CEL", "rb") as handle:
+    ...     c = CelFile.read_v4(handle)
+    ...
+    >>> c.version == 4
+    True
+    >>> print(c.intensities.shape)
+    (5, 5)
+    """
+    # We follow the documentation here:
+    # http://www.affymetrix.com/estore/support/developer/powertools/changelog/gcos-agcc/cel.html.affx
+    record = Record()
+    preHeaders = ["magic", "version", "columns", "rows", "cellNo", "headerLen"]
+    preHeadersMap = dict()
+    headersMap = dict()
+
+    # load pre-headers
+    for name in preHeaders:
+        preHeadersMap[name] = struct.unpack("<i", f.read(4))[0]
+    char = f.read(preHeadersMap["headerLen"])
+    header = char.decode("ascii", "ignore")
+    for header in header.split("\n"):
+        if "=" in header:
+            header = header.split("=")
+            headersMap[header[0]] = "=".join(header[1:])
+
+    # for debugging
+    # pp.pprint("preHeadersMap")
+    # pp.pprint(preHeadersMap)
+    # pp.pprint("headersMap")
+    # pp.pprint(headersMap)
+
+    record.version = preHeadersMap["version"]
+    if record.version != 4:
+        raise ValueError("You are trying to parse CEL file version 4. This file"
+                         " violates the structure expected from CEL file"
+                         " version 4")
+    record.GridCornerUL = headersMap["GridCornerUL"]
+    record.GridCornerUR = headersMap["GridCornerUR"]
+    record.GridCornerLR = headersMap["GridCornerLR"]
+    record.GridCornerLL = headersMap["GridCornerLL"]
+    record.DatHeader = headersMap["DatHeader"]
+    record.Algorithm = headersMap["Algorithm"]
+    record.AlgorithmParameters = headersMap["AlgorithmParameters"]
+    record.NumberCells = preHeadersMap["cellNo"]
+    # record.intensities are set below
+    # record.stdevs are set below
+    # record.npix are set below
+    record.nrows = int(headersMap["Rows"])
+    record.ncols = int(headersMap["Cols"])
+
+    # These cannot be reliably set in v4, because of discrepancies between real
+    # data and the documented format.
+    record.nmask = None
+    record.mask = None
+    record.noutliers = None
+    record.outliers = None
+    record.modified = None
+
+    # Real data never seems to have anything but zeros here, but we don't want
+    # to take chances. Raising an error is better than returning unreliable
+    # data.
+    def raiseBadHeader(field, expected):
+        actual = int(headersMap[field])
+        message = "The header {field} is expected to be 0, not {value}".format(value=actual, field=field)
+        if actual != expected:
+            raise ValueError(message)
+
+    raiseBadHeader("Axis-invertX", 0)
+
+    raiseBadHeader("AxisInvertY", 0)
+
+    raiseBadHeader("OffsetX", 0)
+
+    raiseBadHeader("OffsetY", 0)
+
+    # This is unfortunately undocumented, but it turns out that real data has
+    # the `record.AlgorithmParameters` repeated in the data section, until an
+    # EOF, i.e. b"\x04".
+    char = b"\x00"
+    safetyValve = 10**4
+    for i in range(safetyValve):
+        char = f.read(1)
+        # For debugging
+        # print([i for i in char], end="")
+        if char == b"\x04":
+            break
+        if i == safetyValve:
+            raise ValueError("Parse Error. The parser expects a short, "
+                             "undocumented binary blob terminating with "
+                             "ASCII EOF, x04")
+
+    # After that there are precisely 15 bytes padded. Again, undocumented.
+    padding = f.read(15)
+
+    # That's how we pull out the values (triplets of the form float, float,
+    # signed short).
+    structa = struct.Struct("< f f h")
+
+    # There are 10 bytes in our struct.
+    structSize = 10
+
+    # We initialise the most important: intensities, stdevs and npixs.
+    record.intensities = numpy.empty(record.NumberCells, dtype=float)
+    record.stdevs = numpy.empty(record.NumberCells, dtype=float)
+    record.npix = numpy.empty(record.NumberCells, dtype=int)
+
+    b = f.read(structSize * record.NumberCells)
+    for i in range(record.NumberCells):
+        binaryFragment = b[i * structSize: (i + 1) * structSize]
+        intensity, stdevs, npix = structa.unpack(binaryFragment)
+        record.intensities[i] = intensity
+        record.stdevs[i] = stdevs
+        record.npix[i] = npix
+
+    # reshape without copying.
+    def reshape(array):
+        view = array.view()
+        view.shape = (record.nrows, record.ncols)
+        return view
+
+    record.intensities = reshape(record.intensities)
+    record.stdevs = reshape(record.stdevs)
+    record.npix = reshape(record.npix)
+
+    return record
+
+
+def read_v3(handle):
+    """ Reads Affymetrix CEL file, version 3, and returns a corresponding Record object.
+
+    Example Usage:
+
+    >>> from Bio.Affy import CelFile
+    >>> with open("Affy/affy_v3_example.CEL", "r") as handle:
+    ...     c = CelFile.read_v3(handle)
+    ...
+    >>> c.version == 3
+    True
    """
    # Needs error handling.
    # Needs to know the chip design.
@ -112,7 +318,7 @@ def read(handle):
            section = ""
        elif section == "CEL":
            keyword, value = line.split("=", 1)
-            if keyword == 'Version':
+            if keyword == "Version":
                record.version = int(value)
        elif section == "HEADER":
            # Set record.ncols and record.nrows, remaining data goes into
@ -122,24 +328,24 @@ def read(handle):
                record.ncols = int(value)
            elif keyword == "Rows":
                record.nrows = int(value)
-            elif keyword == 'GridCornerUL':
+            elif keyword == "GridCornerUL":
                x, y = value.split()
                record.GridCornerUL = (int(x), int(y))
-            elif keyword == 'GridCornerUR':
+            elif keyword == "GridCornerUR":
                x, y = value.split()
                record.GridCornerUR = (int(x), int(y))
-            elif keyword == 'GridCornerLR':
+            elif keyword == "GridCornerLR":
                x, y = value.split()
                record.GridCornerLR = (int(x), int(y))
-            elif keyword == 'GridCornerLL':
+            elif keyword == "GridCornerLL":
                x, y = value.split()
                record.GridCornerLL = (int(x), int(y))
-            elif keyword == 'DatHeader':
-                record.DatHeader = value.strip('\n\r')
-            elif keyword == 'Algorithm':
-                record.Algorithm = value.strip('\n\r')
-            elif keyword == 'AlgorithmParameters':
-                record.AlgorithmParameters = value.strip('\n\r')
+            elif keyword == "DatHeader":
+                record.DatHeader = value.strip("\n\r")
+            elif keyword == "Algorithm":
+                record.Algorithm = value.strip("\n\r")
+            elif keyword == "AlgorithmParameters":
+                record.AlgorithmParameters = value.strip("\n\r")
        elif section == "INTENSITY":
            if "NumberCells" in line:
                record.NumberCells = int(line.split("=", 1)[1])
--- a/1
+++ b/1
@ -79,6 +79,7 @@ Frank Kauff <fkauff at domain duke.edu>
 Siong Kong
 David Koppstein <https://github.com/dkoppstein>
 Andreas Kuntzagk <andreas.kuntzagk at domain mdc-berlin.de>
+Adam Kurkiewicz <adam@kurkiewicz.pl>
 Michal Kurowski <michal at domain genesilico.pl>
 Gleb Kuznetsov <https://github.com/glebkuznetsov>
 Uri Laserson <https://github.com/laserson>
--- a/9
+++ b/9
@ -40,6 +40,9 @@ patent data files, and the related IMGT parser was updated to cope with
 IPD-IMGT/HLA database files after release v3.16.0 when their ID line changed.
 The GenBank output now uses colon space to match current NCBI DBLINK lines.

+The Bio.Affy package supports Affymetrix version 4 of the CEL file format,
+in addition to version 3.
+
 Additionally, a number of small bugs have been fixed with further additions
 to the test suite, and there has been further work to follow the Python PEP8,
 PEP257 and best practice standard coding style.
@ -47,9 +50,11 @@ PEP257 and best practice standard coding style.
 Many thanks to the Biopython developers and community for making this release
 possible, especially the following contributors:

+
+Aaron Rosenfeld
+Adam Kurkiewicz (first contribution)
 Adrian Altenhoff (first contribution)
 Allis Tauri (first contribution)
-Aaron Rosenfeld
 Bernhard Thiel (first contribution)
 Bertrand Néron
 Brandon Invergo
@ -57,8 +62,8 @@ Carlos Pena
 Carlos Ríos
 Chris Warth
 Emmanuel Noutahi
-Markus Piotrowski
 Lenna Peterson
+Markus Piotrowski
 Peter Cock
 Richard Neher (first contribution)
 Sourav Singh (first contribution)
--- a/Tests/Affy/affy_v4_example.CEL
+++ b/Tests/Affy/affy_v4_example.CEL
--- a/Tests/test_Affy.py
+++ b/Tests/test_Affy.py
@ -0,0 +1,153 @@
+import unittest
+
+from Bio.Affy import CelFile
+import struct
+import os
+
+try:
+    from numpy import array
+    import numpy.testing
+except ImportError:
+    from Bio import MissingPythonDependencyError
+    raise MissingPythonDependencyError(
+        "Install NumPy if you want to use Bio.Affy.CelFile")
+
+
+class AffyTest(unittest.TestCase):
+    def setUp(self):
+        self.affy3 = "Affy/affy_v3_example.CEL"
+        self.affy4 = "Affy/affy_v4_example.CEL"
+        self.affy4Bad = "Affy/affy_v4_bad_example.CEL"
+        with open(self.affy4Bad, "wb") as f:
+            self.writeExampleV4(f, bad=True)
+
+    def tearDown(self):
+        os.remove(self.affy4Bad)
+
+    # tests if the new strict mode complains about passing a non-file
+    def testAffyStrict(self):
+        with self.assertRaises(IOError):
+            record = CelFile.read("hello", strict=True)
+
+    # tests if the new strict mode is backwards compatible
+    def testAffyStrict(self):
+        record = CelFile.read("hello")
+        assert record.DatHeader is None
+
+    # tests the old Affymetrix v3 parser
+    def testAffy3(self):
+        with open(self.affy3, "r") as f:
+            record = CelFile.read(f)
+            assert(len(record.DatHeader) > 0)
+            assert(record.intensities.shape == (5, 5))
+            assert(record.intensities.shape == record.stdevs.shape)
+            assert(record.intensities.shape == record.npix.shape)
+            assert(record.ncols == 5)
+            assert(record.nrows == 5)
+
+    # tests the new Affymetrix v4 parser
+    def testAffy4(self):
+        with open(self.affy4, "rb") as f:
+            record = CelFile.read(f)
+            assert(record.intensities.shape == (5, 5))
+            assert(record.intensities.shape == record.stdevs.shape)
+            assert(record.intensities.shape == record.npix.shape)
+            assert(record.ncols == 5)
+            assert(record.nrows == 5)
+            numpy.testing.assert_allclose(record.intensities,
+                                          [[0., 1., 2., 3., 4.],
+                                           [5., 6., 7., 8., 9.],
+                                           [10., 11., 12., 13., 14.],
+                                           [15., 16., 17., 18., 19.],
+                                           [20., 21., 22., 23., 24.]])
+            numpy.testing.assert_allclose(record.stdevs,
+                                          [[0., -1., -2., -3., -4.],
+                                           [-5., -6., -7., -8., -9.],
+                                           [-10., -11., -12., -13., -14.],
+                                           [-15., -16., -17., -18., -19.],
+                                           [-20., -21., -22., -23., -24.]])
+            numpy.testing.assert_allclose(record.npix,
+                                          [[9, 9, 9, 9, 9],
+                                           [9, 9, 9, 9, 9],
+                                           [9, 9, 9, 9, 9],
+                                           [9, 9, 9, 9, 9],
+                                           [9, 9, 9, 9, 9]])
+            assert(len(record.AlgorithmParameters) == 329)
+            assert(len(record.GridCornerUL) == 7)
+            assert(record.AlgorithmParameters[-3:] == '169')
+
+    def testAffyBadHeader(self):
+        with self.assertRaises(ValueError):
+            with open(self.affy4Bad, "rb") as f:
+                record = CelFile.read(f)
+
+    # Writes a small example Affymetrix V4 CEL File
+    def writeExampleV4(self, f, bad=False):
+        preHeaders = {'cellNo': 25,
+                      'columns': 5,
+                      'headerLen': 752,
+                      'magic': 64,
+                      'rows': 5,
+                      'version': 4}
+        goodH = {u'Axis-invertX': b'0'}
+        badH = {u'Axis-invertX': b'1'}
+
+        headers = {u'Algorithm': b'Percentile',
+                   u'AlgorithmParameters': b'Percentile:75;CellMargin:4;Outlie'
+                   b'rHigh:1.500;OutlierLow:1.004;AlgVersion:6.0;FixedCellSize'
+                   b':TRUE;FullFeatureWidth:7;FullFeatureHeight:7;IgnoreOutlie'
+                   b'rsInShiftRows:FALSE;FeatureExtraction:TRUE;PoolWidthExten'
+                   b'stion:1;PoolHeightExtension:1;UseSubgrids:FALSE;Randomize'
+                   b'Pixels:FALSE;ErrorBasis:StdvMean;StdMult:1.000000;NumDATS'
+                   b'ubgrids:169',
+                   u'AxisInvertY': b'0',
+                   u'Cols': b'5',
+                   u'DatHeader': b'[0..65534]  20_10N:CLS=19420RWS=19420XIN=0'
+                   b' YIN=0  VE=30        2.0 05/25/05 23:19:07 50102310  M10 '
+                   b'  \x14  \x14 HuEx-1_0-st-v2.1sq \x14  \x14  \x14  \x14  '
+                   b'\x14570 \x14 25540.671875 \x14 3.500000 \x14 0.7000 \x14'
+                   b' 3',
+                   u'GridCornerLL': b'518 18668',
+                   u'GridCornerLR': b'18800 18825',
+                   u'GridCornerUL': b'659 469',
+                   u'GridCornerUR': b'18942 623',
+                   u'OffsetX': b'0',
+                   u'OffsetY': b'0',
+                   u'Rows': b'5',
+                   u'TotalX': b'2560',
+                   u'TotalY': b'2560',
+                   u'swapXY': b'0'}
+        if not bad:
+            headers.update(goodH)
+        else:
+            headers.update(badH)
+        prePadding = b"this text doesn't matter and is ignored\x04"
+        preHeadersOrder = ["magic",
+                           "version",
+                           "columns",
+                           "rows",
+                           "cellNo",
+                           "headerLen"]
+        headersEncoded = struct.pack('<' + 'i' * len(preHeadersOrder),
+                                     *[preHeaders[header] for header in
+                                     preHeadersOrder])
+
+        def packData(intensity, sdev, pixel):
+            return struct.pack("< f f h", intensity, sdev, pixel)
+        f.write(headersEncoded)
+        for header in headers:
+            try:
+                f.write(bytes(header, encoding="utf-8") +
+                        b"=" +
+                        headers[header] +
+                        b"\n")
+            except TypeError:
+                f.write(header + b"=" + headers[header] + b"\n")
+        f.write(prePadding)
+        f.write(b"\x00" * 15)
+        for i in range(25):
+            f.write(packData(float(i), float(-i), 9))
+
+if __name__ == "__main__":
+    runner = unittest.TextTestRunner(verbosity=0)
+    unittest.main(testRunner=runner)