From 6c14a26dda32ad6d3147036a01f3d0c4d306c647 Mon Sep 17 00:00:00 2001 From: Adam Kurkiewicz Date: Sun, 27 Nov 2016 19:06:22 +0000 Subject: [PATCH] Parser for Affy CEL file version 4 Backwards compatibility with version 3 is maintained. Test cases for package Affy are added. Both new and old code is now tested. affy_v4_example.CEL is added. It is artifically generated and small. See also: http://www.affymetrix.com/estore/support/developer/powertools/changelog/gcos-agcc/cel.html.affx Which explains: Version 3 is generated by the MAS software. This was also known as the ASCII version. Version 4 is generated by the GCOS software. This was also known as the binary or XDA version. Command Console version 1 is generated by the Command Console software. This is stored in the Command Console "generic" data file format. (Squashed commit of pull reqesut #1011) --- Bio/Affy/CelFile.py | 250 ++++++++++++++++++++++++++++++--- CONTRIB | 1 + NEWS | 9 +- Tests/Affy/affy_v4_example.CEL | Bin 0 -> 1079 bytes Tests/test_Affy.py | 153 ++++++++++++++++++++ 5 files changed, 389 insertions(+), 24 deletions(-) create mode 100644 Tests/Affy/affy_v4_example.CEL create mode 100644 Tests/test_Affy.py diff --git a/Bio/Affy/CelFile.py b/Bio/Affy/CelFile.py index db82f4e5d..d48515e2e 100644 --- a/Bio/Affy/CelFile.py +++ b/Bio/Affy/CelFile.py @@ -1,20 +1,28 @@ -# Copyright 2004 by Harry Zuzan. All rights reserved. +# Copyright 2004 by Harry Zuzan. All rights reserved. +# Copyright 2016 by Adam Kurkiewicz. All rights reserved. # This code is part of the Biopython distribution and governed by its # license. Please see the LICENSE file that should have been included # as part of this package. """ -Classes for accessing the information in Affymetrix cel files. - -Functions: -read Read a cel file and store its contents in a Record - -Classes: -Record Contains the information from a cel file +Reading information from Affymetrix CEL files version 3 and 4. """ -# We use print in the doctests from __future__ import print_function +import sys +import struct + +try: + import numpy +except ImportError: + from Bio import MissingPythonDependencyError + raise MissingPythonDependencyError( + "Install NumPy if you want to use Bio.Affy.CelFile") + +# for debugging +# import pprint +# pp = pprint.PrettyPrinter(indent=4) + try: import numpy @@ -30,7 +38,7 @@ class Record(object): Example usage: >>> from Bio.Affy import CelFile - >>> with open('Affy/affy_v3_example.CEL') as handle: + >>> with open("Affy/affy_v3_example.CEL", "r") as handle: ... c = CelFile.read(handle) ... >>> print(c.ncols, c.nrows) @@ -78,8 +86,206 @@ class Record(object): def read(handle): + """ Reads Affymetrix CEL file and returns Record object. + + CEL files version 3 and 4 are supported, and the parser attempts version detection. + + Example Usage: + + >>> from Bio.Affy import CelFile + >>> with open("Affy/affy_v4_example.CEL", "rb") as handle: + ... c = CelFile.read(handle) + ... + >>> c.version == 4 + True """ - Read the information in a cel file, and store it in a Record. + # If we fail to read the magic number, then it will remain None, and thus + # we will invoke read_v3 (if mode is not strict), or raise IOError if mode is + # strict. + magicNumber = None + # We check if the handle is a file-like object. If it isn't, and the mode + # is strict, we raise an error. If it isn't and the mode isn't strict, we + # continue (perhaps somebody has got a CEL-file-like iterable, which used + # to work with previous versions of biopython and we don't want to maintain + # backwards compatibility). + try: + mode = handle.mode + # By definition an Affymetrix v4 CEL file has 64 as the first 4 bytes. + # Note that we use little-endian irrespective of the platform, again by + # definition. + position = handle.tell() + magicNumber = struct.unpack(">> from Bio.Affy import CelFile + >>> with open("Affy/affy_v4_example.CEL", "rb") as handle: + ... c = CelFile.read_v4(handle) + ... + >>> c.version == 4 + True + >>> print(c.intensities.shape) + (5, 5) + """ + # We follow the documentation here: + # http://www.affymetrix.com/estore/support/developer/powertools/changelog/gcos-agcc/cel.html.affx + record = Record() + preHeaders = ["magic", "version", "columns", "rows", "cellNo", "headerLen"] + preHeadersMap = dict() + headersMap = dict() + + # load pre-headers + for name in preHeaders: + preHeadersMap[name] = struct.unpack(">> from Bio.Affy import CelFile + >>> with open("Affy/affy_v3_example.CEL", "r") as handle: + ... c = CelFile.read_v3(handle) + ... + >>> c.version == 3 + True """ # Needs error handling. # Needs to know the chip design. @@ -112,7 +318,7 @@ def read(handle): section = "" elif section == "CEL": keyword, value = line.split("=", 1) - if keyword == 'Version': + if keyword == "Version": record.version = int(value) elif section == "HEADER": # Set record.ncols and record.nrows, remaining data goes into @@ -122,24 +328,24 @@ def read(handle): record.ncols = int(value) elif keyword == "Rows": record.nrows = int(value) - elif keyword == 'GridCornerUL': + elif keyword == "GridCornerUL": x, y = value.split() record.GridCornerUL = (int(x), int(y)) - elif keyword == 'GridCornerUR': + elif keyword == "GridCornerUR": x, y = value.split() record.GridCornerUR = (int(x), int(y)) - elif keyword == 'GridCornerLR': + elif keyword == "GridCornerLR": x, y = value.split() record.GridCornerLR = (int(x), int(y)) - elif keyword == 'GridCornerLL': + elif keyword == "GridCornerLL": x, y = value.split() record.GridCornerLL = (int(x), int(y)) - elif keyword == 'DatHeader': - record.DatHeader = value.strip('\n\r') - elif keyword == 'Algorithm': - record.Algorithm = value.strip('\n\r') - elif keyword == 'AlgorithmParameters': - record.AlgorithmParameters = value.strip('\n\r') + elif keyword == "DatHeader": + record.DatHeader = value.strip("\n\r") + elif keyword == "Algorithm": + record.Algorithm = value.strip("\n\r") + elif keyword == "AlgorithmParameters": + record.AlgorithmParameters = value.strip("\n\r") elif section == "INTENSITY": if "NumberCells" in line: record.NumberCells = int(line.split("=", 1)[1]) diff --git a/CONTRIB b/CONTRIB index 01aa88490..97ae08ad9 100644 --- a/CONTRIB +++ b/CONTRIB @@ -79,6 +79,7 @@ Frank Kauff Siong Kong David Koppstein Andreas Kuntzagk +Adam Kurkiewicz Michal Kurowski Gleb Kuznetsov Uri Laserson diff --git a/NEWS b/NEWS index 6a0bfcbaa..e9254ad0f 100644 --- a/NEWS +++ b/NEWS @@ -40,6 +40,9 @@ patent data files, and the related IMGT parser was updated to cope with IPD-IMGT/HLA database files after release v3.16.0 when their ID line changed. The GenBank output now uses colon space to match current NCBI DBLINK lines. +The Bio.Affy package supports Affymetrix version 4 of the CEL file format, +in addition to version 3. + Additionally, a number of small bugs have been fixed with further additions to the test suite, and there has been further work to follow the Python PEP8, PEP257 and best practice standard coding style. @@ -47,9 +50,11 @@ PEP257 and best practice standard coding style. Many thanks to the Biopython developers and community for making this release possible, especially the following contributors: + +Aaron Rosenfeld +Adam Kurkiewicz (first contribution) Adrian Altenhoff (first contribution) Allis Tauri (first contribution) -Aaron Rosenfeld Bernhard Thiel (first contribution) Bertrand Néron Brandon Invergo @@ -57,8 +62,8 @@ Carlos Pena Carlos Ríos Chris Warth Emmanuel Noutahi -Markus Piotrowski Lenna Peterson +Markus Piotrowski Peter Cock Richard Neher (first contribution) Sourav Singh (first contribution) diff --git a/Tests/Affy/affy_v4_example.CEL b/Tests/Affy/affy_v4_example.CEL new file mode 100644 index 0000000000000000000000000000000000000000..9dc05e342d5a71e71a948739973017b6cbff36d1 GIT binary patch literal 1079 zcmZwGF>~556aZjfuN}&ip-VSyop2-@*{ON$t`lR% zj2Rg-X3SRlPx2G`6Z#}Wn%-51#OEjRWciu8VHmHWzuENn@csLhVSE-mYNjG#qSp&3 zsUlK!oNDPXm0>&yEb8FGCxsjFG!$Eb;c4PICidHW!6KDo#D6gFa5!%H z?L3a#ER?xm$2^iV&wUx{Fi0*T)}1D)V4EGYZZewj3#m@ywHv(=q$yIg7E*-gQthX+ z57IQ&X<>pTnVulNiWZrRuv$+1amGgZw<(lsn{*f^Q96fP9l(9Z+h|LORD23Eo_Qk~ zUG-U*_<(*MOMPk#&EL=GdyT{Ad<5#%Sn#aOlPe~qD%D;nJ1mTt2)lu;jnSrACPtX<$vuh1eYJ7|5P&S(AP?JP+e;Y zT57F8E3GwXt@Q+Y(lQ!`Vf<3R-2v@rArR8SAgomf)wKqofz}!5OzRSKskH2OBdZ{{R30 literal 0 HcmV?d00001 diff --git a/Tests/test_Affy.py b/Tests/test_Affy.py new file mode 100644 index 000000000..4e600528b --- /dev/null +++ b/Tests/test_Affy.py @@ -0,0 +1,153 @@ +import unittest + +from Bio.Affy import CelFile +import struct +import os + +try: + from numpy import array + import numpy.testing +except ImportError: + from Bio import MissingPythonDependencyError + raise MissingPythonDependencyError( + "Install NumPy if you want to use Bio.Affy.CelFile") + + +class AffyTest(unittest.TestCase): + def setUp(self): + self.affy3 = "Affy/affy_v3_example.CEL" + self.affy4 = "Affy/affy_v4_example.CEL" + self.affy4Bad = "Affy/affy_v4_bad_example.CEL" + with open(self.affy4Bad, "wb") as f: + self.writeExampleV4(f, bad=True) + + def tearDown(self): + os.remove(self.affy4Bad) + + # tests if the new strict mode complains about passing a non-file + def testAffyStrict(self): + with self.assertRaises(IOError): + record = CelFile.read("hello", strict=True) + + # tests if the new strict mode is backwards compatible + def testAffyStrict(self): + record = CelFile.read("hello") + assert record.DatHeader is None + + # tests the old Affymetrix v3 parser + def testAffy3(self): + with open(self.affy3, "r") as f: + record = CelFile.read(f) + assert(len(record.DatHeader) > 0) + assert(record.intensities.shape == (5, 5)) + assert(record.intensities.shape == record.stdevs.shape) + assert(record.intensities.shape == record.npix.shape) + assert(record.ncols == 5) + assert(record.nrows == 5) + + # tests the new Affymetrix v4 parser + def testAffy4(self): + with open(self.affy4, "rb") as f: + record = CelFile.read(f) + assert(record.intensities.shape == (5, 5)) + assert(record.intensities.shape == record.stdevs.shape) + assert(record.intensities.shape == record.npix.shape) + assert(record.ncols == 5) + assert(record.nrows == 5) + numpy.testing.assert_allclose(record.intensities, + [[0., 1., 2., 3., 4.], + [5., 6., 7., 8., 9.], + [10., 11., 12., 13., 14.], + [15., 16., 17., 18., 19.], + [20., 21., 22., 23., 24.]]) + numpy.testing.assert_allclose(record.stdevs, + [[0., -1., -2., -3., -4.], + [-5., -6., -7., -8., -9.], + [-10., -11., -12., -13., -14.], + [-15., -16., -17., -18., -19.], + [-20., -21., -22., -23., -24.]]) + numpy.testing.assert_allclose(record.npix, + [[9, 9, 9, 9, 9], + [9, 9, 9, 9, 9], + [9, 9, 9, 9, 9], + [9, 9, 9, 9, 9], + [9, 9, 9, 9, 9]]) + assert(len(record.AlgorithmParameters) == 329) + assert(len(record.GridCornerUL) == 7) + assert(record.AlgorithmParameters[-3:] == '169') + + def testAffyBadHeader(self): + with self.assertRaises(ValueError): + with open(self.affy4Bad, "rb") as f: + record = CelFile.read(f) + + # Writes a small example Affymetrix V4 CEL File + def writeExampleV4(self, f, bad=False): + preHeaders = {'cellNo': 25, + 'columns': 5, + 'headerLen': 752, + 'magic': 64, + 'rows': 5, + 'version': 4} + goodH = {u'Axis-invertX': b'0'} + badH = {u'Axis-invertX': b'1'} + + headers = {u'Algorithm': b'Percentile', + u'AlgorithmParameters': b'Percentile:75;CellMargin:4;Outlie' + b'rHigh:1.500;OutlierLow:1.004;AlgVersion:6.0;FixedCellSize' + b':TRUE;FullFeatureWidth:7;FullFeatureHeight:7;IgnoreOutlie' + b'rsInShiftRows:FALSE;FeatureExtraction:TRUE;PoolWidthExten' + b'stion:1;PoolHeightExtension:1;UseSubgrids:FALSE;Randomize' + b'Pixels:FALSE;ErrorBasis:StdvMean;StdMult:1.000000;NumDATS' + b'ubgrids:169', + u'AxisInvertY': b'0', + u'Cols': b'5', + u'DatHeader': b'[0..65534] 20_10N:CLS=19420RWS=19420XIN=0' + b' YIN=0 VE=30 2.0 05/25/05 23:19:07 50102310 M10 ' + b' \x14 \x14 HuEx-1_0-st-v2.1sq \x14 \x14 \x14 \x14 ' + b'\x14570 \x14 25540.671875 \x14 3.500000 \x14 0.7000 \x14' + b' 3', + u'GridCornerLL': b'518 18668', + u'GridCornerLR': b'18800 18825', + u'GridCornerUL': b'659 469', + u'GridCornerUR': b'18942 623', + u'OffsetX': b'0', + u'OffsetY': b'0', + u'Rows': b'5', + u'TotalX': b'2560', + u'TotalY': b'2560', + u'swapXY': b'0'} + if not bad: + headers.update(goodH) + else: + headers.update(badH) + prePadding = b"this text doesn't matter and is ignored\x04" + preHeadersOrder = ["magic", + "version", + "columns", + "rows", + "cellNo", + "headerLen"] + headersEncoded = struct.pack('<' + 'i' * len(preHeadersOrder), + *[preHeaders[header] for header in + preHeadersOrder]) + + def packData(intensity, sdev, pixel): + return struct.pack("< f f h", intensity, sdev, pixel) + f.write(headersEncoded) + for header in headers: + try: + f.write(bytes(header, encoding="utf-8") + + b"=" + + headers[header] + + b"\n") + except TypeError: + f.write(header + b"=" + headers[header] + b"\n") + f.write(prePadding) + f.write(b"\x00" * 15) + for i in range(25): + f.write(packData(float(i), float(-i), 9)) + +if __name__ == "__main__": + runner = unittest.TextTestRunner(verbosity=0) + unittest.main(testRunner=runner)