1
2
3
4
5
6
7
8
9 """Unit tests for PyMVPA dataset handling"""
10
11 import unittest
12 import random
13 import numpy as N
14 from sets import Set
15 from mvpa.datasets import Dataset
16 from mvpa.datasets.miscfx import zscore, aggregateFeatures
17 from mvpa.mappers.mask import MaskMapper
18 from mvpa.misc.exceptions import DatasetError
19
20 from tests_warehouse import datasets
21
23
25 data = Dataset(samples=range(5), labels=1, chunks=1)
26
27 self.failUnlessEqual(
28 data.uniquelabels, [1],
29 msg="uniquelabels must be correctly recomputed")
30
31
32 self.failUnlessEqual( data.nsamples, 1)
33
34 self.failUnless(
35 (data.samples == N.array([[0, 1, 2, 3, 4]])).all() )
36
37
38 self.failUnless( (data.labels == N.array([1])).all() )
39 self.failUnless( (data.chunks == N.array([1])).all() )
40
41
42 self.failUnlessRaises( DatasetError,
43 data.__iadd__, Dataset(samples=N.ones((2,3)),
44 labels=1,
45 chunks=1))
46
47
48 data += Dataset(samples=N.random.standard_normal((2,5)),
49 labels=2, chunks=2 )
50 self.failUnlessEqual( data.nfeatures, 5 )
51 self.failUnless( (data.labels == N.array([1,2,2]) ).all() )
52 self.failUnless( (data.chunks == N.array([1,2,2]) ).all() )
53
54
55 data += Dataset(samples=N.random.standard_normal((2,5)), labels=3)
56 self.failUnless( (data.chunks == N.array([1,2,2,0,1]) ).all() )
57
58
59 self.failUnless( (data.uniquelabels == N.array([1,2,3]) ).all() )
60
61
62 self.failUnlessRaises( DatasetError,
63 Dataset,
64 samples=N.random.standard_normal((4,5)),
65 labels=[ 1, 2, 3 ],
66 chunks=2 )
67
68
69 self.failUnlessRaises( DatasetError,
70 Dataset,
71 samples=N.random.standard_normal((4,5)),
72 labels=[ 1, 2, 3, 4 ],
73 chunks=[ 2, 2, 2 ] )
74
75
77 origdata = N.random.standard_normal((10,100))
78 data = Dataset(samples=origdata, labels=2, chunks=2 )
79
80
81 data.defineFeatureGroups(N.repeat(range(4), 25))
82
83 unmasked = data.samples.copy()
84
85
86 self.failUnless( data.nfeatures == 100 )
87
88 bsel = N.array([False]*100)
89 bsel[ [0,20,79] ] = True
90
91 for sel in [ data.selectFeatures( [0,20,79], sort=False ),
92 data.select(slice(None), [0,20,79]),
93 data.select(slice(None), N.array([0,20,79])),
94 data.select(slice(None), bsel),
95 ]:
96 self.failUnless(sel.nfeatures == 3)
97
98
99 self.failUnless( sel.samples.shape == (10,3) )
100
101
102 self.failUnless( (unmasked[:,[0,20,79]]==sel.samples).all() )
103
104
105 self.failUnless((sel._dsattr['featuregroups'] == [0, 0, 3]).all())
106
107
108 gsel = data.selectFeatures(groups=[2,3])
109 self.failUnless(gsel.nfeatures == 50)
110
111
113 origdata = N.random.standard_normal((10,100))
114 data = Dataset(samples=origdata, labels=2, chunks=2 )
115
116 self.failUnless( data.nsamples == 10 )
117
118
119 for sel in [ data.selectSamples(5),
120 data.select(5),
121 data.select(slice(5, 6)),
122 ]:
123 self.failUnless( sel.nsamples == 1 )
124 self.failUnless( data.nfeatures == 100 )
125 self.failUnless( sel.origids == [5] )
126
127
128 for sel in [ data.selectSamples([5, 5]),
129
130
131
132
133
134 ]:
135 self.failUnless( sel.nsamples == 2 )
136 self.failUnless( (sel.samples[0] == data.samples[5]).all() )
137 self.failUnless( (sel.samples[0] == sel.samples[1]).all() )
138 self.failUnless( len(sel.labels) == 2 )
139 self.failUnless( len(sel.chunks) == 2 )
140 self.failUnless((sel.origids == [5, 5]).all())
141
142 self.failUnless( sel.samples.shape == (2,100) )
143
144
145 for sel in [ data.selectSamples(data.idsbylabels(2)),
146 data.select(labels=2),
147 data.select('labels', 2),
148 data.select('labels', [2]),
149 data['labels', [2]],
150 data['labels': [2], 'labels':2],
151 data['labels': [2]],
152 ]:
153 self.failUnless( sel.nsamples == data.nsamples )
154 self.failUnless( N.all(sel.samples == data.samples) )
155
156 for sel in [ data.selectSamples(data.idsbylabels(3)),
157 data.select(labels=3),
158 data.select('labels', 3),
159 data.select('labels', [3]),
160 ]:
161 self.failUnless( sel.nsamples == 0 )
162
163 data = Dataset(samples=origdata,
164 labels=[8, 9, 4, 3, 3, 3, 4, 2, 8, 9],
165 chunks=2)
166 for sel in [ data.selectSamples(data.idsbylabels([2, 3])),
167 data.select('labels', [2, 3]),
168 data.select('labels', [2, 3], labels=[1, 2, 3, 4]),
169 data.select('labels', [2, 3], chunks=[1, 2, 3, 4]),
170 data['labels':[2, 3], 'chunks':[1, 2, 3, 4]],
171 data['chunks':[1, 2, 3, 4], 'labels':[2, 3]],
172 ]:
173 self.failUnless(N.all(sel.origids == [ 3., 4., 5., 7.]))
174
175
176 self.failUnless( (data.uniquelabels == [2, 3, 4, 8, 9]).all() );
177
178
179
180 sel = data.selectSamples(data.idsbylabels([3, 4, 8, 9]))
181 self.failUnlessEqual(Set(sel.uniquelabels), Set([3, 4, 8, 9]))
182 self.failUnless((sel.origids == [0, 1, 2, 3, 4, 5, 6, 8, 9]).all())
183
184
186 """Test some obscure selections of samples via select() or __getitem__
187 """
188 origdata = N.random.standard_normal((10,100))
189 data = Dataset(samples=origdata,
190
191 labels=[8, 9, 4, 3, 3, 3, 3, 2, 8, 9],
192 chunks=[1, 2, 3, 2, 3, 1, 5, 6, 3, 6])
193
194
195 if __debug__:
196
197 self.failUnlessRaises(ValueError, data.__getitem__,
198 'labels', 'featu')
199
200
201 self.failUnlessRaises(ValueError, data.__getitem__, 1, 1, 1)
202
203
204 for sel in [ data.select('chunks', [2, 6], labels=[3, 2],
205 features=slice(None)),
206 data.select('all', 'all', labels=[2,3], chunks=[2, 6]),
207 data['chunks', [2, 6], 'labels', [3, 2]],
208 data[:, :, 'chunks', [2, 6], 'labels', [3, 2]],
209
210 data[3:8, 'chunks', [2, 6, 2, 6], 'labels', [3, 2]],
211 ]:
212 self.failUnless(N.all(sel.origids == [3,7]))
213 self.failUnless(sel.nfeatures == 100)
214 self.failUnless(N.all(sel.samples == origdata[ [3,7] ]))
215
216 target = origdata[ [3, 7] ]
217 target = target[:, [1,3] ]
218
219 for sel in [ data.select('all', [1, 3],
220 'chunks', [2, 6], labels=[3, 2]),
221 data[:, [1,3], 'chunks', [2, 6], 'labels', [3, 2]],
222 data[:, [1,3], 'chunks', [2, 6], 'labels', [3, 2]],
223
224 data[3:8, [1, 1, 3, 1],
225 'chunks', [2, 6, 2, 6], 'labels', [3, 2]],
226 ]:
227 self.failUnless(N.all(sel.origids == [3,7]))
228 self.failUnless(sel.nfeatures == 2)
229 self.failUnless(N.all(sel.samples == target))
230
231
232 self.failUnless(data.select(chunks=[23]).nsamples == 0)
233
234
235 self.failUnless(N.all(data.where(chunks=[2,6]) == [1, 3, 7, 9]))
236 self.failUnless(N.all(data.where(chunks=[2,6], labels=[22, 3]) == [3]))
237
238 idx = data.where('all', [1, 3, 10], labels=[2, 3, 4])
239 self.failUnless(N.all(idx[1] == [1, 3, 10]))
240 self.failUnless(N.all(idx[0] == range(2, 8)))
241
242 self.failUnless(data.where() is None)
243
244 self.failUnless(data.where(labels=[123]) == [])
245
246
260
261
263 data1 = Dataset(samples=N.ones((5,5)), labels=1, chunks=1 )
264 data2 = Dataset(samples=N.ones((3,5)), labels=2, chunks=1 )
265
266 merged = data1 + data2
267
268 self.failUnless( merged.nfeatures == 5 )
269 self.failUnless( (merged.labels == [ 1,1,1,1,1,2,2,2]).all() )
270 self.failUnless( (merged.chunks == [ 1,1,1,1,1,1,1,1]).all() )
271
272 data1 += data2
273
274 self.failUnless( data1.nfeatures == 5 )
275 self.failUnless( (data1.labels == [ 1,1,1,1,1,2,2,2]).all() )
276 self.failUnless( (data1.chunks == [ 1,1,1,1,1,1,1,1]).all() )
277
278
280 data = Dataset(samples=N.ones((5,1)), labels=range(5), chunks=1 )
281 data += Dataset(samples=N.ones((5,1))+1, labels=range(5), chunks=2 )
282 data += Dataset(samples=N.ones((5,1))+2, labels=range(5), chunks=3 )
283 data += Dataset(samples=N.ones((5,1))+3, labels=range(5), chunks=4 )
284 data += Dataset(samples=N.ones((5,1))+4, labels=range(5), chunks=5 )
285 self.failUnless( data.samplesperlabel == {0:5, 1:5, 2:5, 3:5, 4:5} )
286
287
288 sample = data.getRandomSamples( 2 )
289 self.failUnless( sample.samplesperlabel.values() == [ 2,2,2,2,2 ] )
290
291 self.failUnless( (data.uniquechunks == range(1,6)).all() )
292
293
294 origlabels = data.labels.copy()
295
296 data.permuteLabels(True)
297
298 self.failIf( (data.labels == origlabels).all() )
299
300 data.permuteLabels(False)
301
302 self.failUnless( (data.labels == origlabels).all() )
303
304
305 data2 = Dataset(samples=data.samples,
306 labels=data.labels,
307 chunks=data.chunks )
308
309
310 self.failUnless( (data2.labels == origlabels).all() )
311
312
313 data2.permuteLabels( True )
314
315
316 self.failUnless( (data.labels == origlabels).all() )
317
318 self.failIf( (data2.labels == origlabels).all() )
319
320
322
323
324
325 ds = Dataset(samples=range(5), labels=1, chunks=1)
326 self.failUnlessRaises(AttributeError, lambda x:x.blobs, ds)
327 """Dataset.blobs should fail since .blobs wasn't yet registered"""
328
329
330 Dataset._registerAttribute("blobs", "_data", hasunique=True)
331 ds = Dataset(samples=range(5), labels=1, chunks=1)
332 self.failUnless(not ds.blobs != [ 0 ],
333 msg="By default new attributes supposed to get 0 as the value")
334
335 try:
336 ds.blobs = [1,2]
337 self.fail(msg="Dataset.blobs=[1,2] should fail since there is 5 samples")
338 except ValueError, e:
339 pass
340
341 try:
342 ds.blobs = [1]
343 except e:
344 self.fail(msg="We must be able to assign the attribute")
345
346
347
348
349
350
359
360
362
363 samples = N.array( (0,1,3,4,2,2,3,1,1,3,3,1,2,2,2,2) ).\
364 reshape((16, 1))
365 data = Dataset(samples=samples,
366 labels=range(16), chunks=[0]*16)
367 self.failUnlessEqual( data.samples.mean(), 2.0 )
368 self.failUnlessEqual( data.samples.std(), 1.0 )
369 zscore(data, perchunk=True)
370
371
372 check = N.array([-2,-1,1,2,0,0,1,-1,-1,1,1,-1,0,0,0,0],
373 dtype='float64').reshape(16,1)
374 self.failUnless( (data.samples == check).all() )
375
376 data = Dataset(samples=samples,
377 labels=range(16), chunks=[0]*16)
378 zscore(data, perchunk=False)
379 self.failUnless( (data.samples == check).all() )
380
381
382 data = Dataset(samples=samples,
383 labels=[0, 2, 2, 2, 1] + [2]*11,
384 chunks=[0]*16)
385 zscore(data, baselinelabels=[0, 1])
386 self.failUnless((samples == data.samples+1.0).all())
387
388
390 data = Dataset(samples=N.arange( 20 ).reshape( (4,5) ),
391 labels=1,
392 chunks=1)
393
394 ag_data = aggregateFeatures(data, N.mean)
395
396 self.failUnless(ag_data.nsamples == 4)
397 self.failUnless(ag_data.nfeatures == 1)
398 self.failUnless((ag_data.samples[:,0] == [2, 7, 12, 17]).all())
399
400
402 """Test creation of new dataset by applying a mapper"""
403 mapper = MaskMapper(N.array([1,0,1]))
404 dataset = Dataset(samples=N.arange(12).reshape( (4,3) ),
405 labels=1,
406 chunks=1)
407 seldataset = dataset.applyMapper(featuresmapper=mapper)
408 self.failUnless( (dataset.selectFeatures([0, 2]).samples
409 == seldataset.samples).all() )
410
411
412
413 if __debug__:
414
415 self.failUnlessRaises(ValueError, mapper.reverse, [10,20,30])
416 self.failUnlessRaises(ValueError, mapper.forward, [10,20])
417
418
419
420
421
422
423
424
426 """Test Dataset.idhash() if it gets changed if any of the labels/chunks changes"""
427
428 dataset = Dataset(samples=N.arange(12).reshape( (4,3) ),
429 labels=1,
430 chunks=1)
431 origid = dataset.idhash
432 dataset.labels = [3, 1, 2, 3]
433 self.failUnless(origid != dataset.idhash,
434 msg="Changing all labels should alter dataset's idhash")
435
436 origid = dataset.idhash
437
438 z = dataset.labels[1]
439 self.failUnlessEqual(origid, dataset.idhash,
440 msg="Accessing shouldn't change idhash")
441 z = dataset.chunks
442 self.failUnlessEqual(origid, dataset.idhash,
443 msg="Accessing shouldn't change idhash")
444 z[2] = 333
445 self.failUnless(origid != dataset.idhash,
446 msg="Changing value in attribute should change idhash")
447
448 origid = dataset.idhash
449 dataset.samples[1,1] = 1000
450 self.failUnless(origid != dataset.idhash,
451 msg="Changing value in data should change idhash")
452
453
454 origid = dataset.idhash
455 dataset.permuteLabels(True)
456 self.failUnless(origid != dataset.idhash,
457 msg="Permutation also changes idhash")
458
459 dataset.permuteLabels(False)
460 self.failUnless(origid == dataset.idhash,
461 msg="idhash should be restored after permuteLabels(False)")
462
463
480
481
488
489
491 od = {'apple':0, 'orange':1}
492 samples = [[3],[2],[3]]
493 labels_l = ['apple', 'orange', 'apple']
494
495
496 ds = Dataset(samples=samples, labels='orange')
497 self.failUnless(N.all(ds.labels == ['orange']*3))
498
499
500 for ds in [Dataset(samples=samples, labels=labels_l, labels_map=od),
501
502 Dataset(samples=samples, labels=labels_l, labels_map=True)]:
503 self.failUnless(N.all(ds.labels == [0, 1, 0]))
504 self.failUnless(ds.labels_map == od)
505 ds_ = ds[1]
506 self.failUnless(ds_.labels_map == od,
507 msg='selectSamples should provide full mapping preserved')
508
509
510 self.failUnlessRaises(ValueError, Dataset, samples=samples,
511 labels=labels_l, labels_map = {'apple':0})
512
513
514
515 ds2 = Dataset(samples=samples, labels=labels_l)
516 self.failUnlessEqual(ds2.labels_map, None)
517
518
519 od3 = {1:100, 2:101, 3:100}
520 ds3 = Dataset(samples=samples, labels=[1,2,3],
521 labels_map = od3)
522 self.failUnlessEqual(ds3.labels_map, od3)
523 self.failUnless(N.all(ds3.labels == [100, 101, 100]))
524
525 ds3_ = ds3[1]
526 self.failUnlessEqual(ds3.labels_map, od3)
527
528 ds4 = Dataset(samples=samples, labels=labels_l)
529
530
531 ds = Dataset(samples=samples, labels=labels_l, labels_map=od)
532
533 self.failUnlessRaises(ValueError, ds.setLabelsMap,
534 {'orange': 1, 'nonorange': 3})
535 new_map = {'tasty':0, 'crappy':1}
536 ds.labels_map = new_map.copy()
537 self.failUnlessEqual(ds.labels_map, new_map)
538
539
541 """Adding datasets needs special care whenever labels mapping
542 is used."""
543 samples = [[3],[2],[3]]
544 l1 = ['a', 'b', 'a']
545 l2 = ['b', 'a', 'c']
546 ds1 = Dataset(samples=samples, labels=l1,
547 labels_map={'a':1, 'b':2})
548 ds2 = Dataset(samples=samples, labels=l2,
549 labels_map={'c':1, 'a':4, 'b':2})
550
551
552 ds0 = Dataset(samples=samples, labels=l2)
553
554
555 lm1 = ds1.labels_map.copy()
556 lm2 = ds2.labels_map.copy()
557
558 ds3 = ds1 + ds2
559 self.failUnless(N.all(ds3.labels ==
560 N.hstack((ds1.labels, [2, 1, 5]))))
561 self.failUnless(ds1.labels_map == lm1)
562 self.failUnless(ds2.labels_map == lm2)
563
564
565 ds1 += ds2
566 self.failUnless(N.all(ds1.labels == ds3.labels))
567
568
569 self.failUnless(N.all(ds1.labels_map == ds3.labels_map))
570
571
572
573 self.failUnlessRaises(ValueError, ds1.__add__, ds0)
574 self.failUnlessRaises(ValueError, ds1.__iadd__, ds0)
575
576
578
579 ds = datasets['uni2small']
580
581 ds_ = ds.copy()
582
583 self.failUnless(N.all(ds.samples == ds_.samples))
584 self.failUnless(N.all(ds.labels == ds_.labels))
585 self.failUnless(N.all(ds.chunks == ds_.chunks))
586
587
588 ds_.samples[0,0] = 1234
589 self.failUnless(N.any(ds.samples != ds_.samples))
590 self.failUnless(N.all(ds.labels == ds_.labels))
591 self.failUnless(N.all(ds.chunks == ds_.chunks))
592
593 ds_.labels = N.hstack(([123], ds_.labels[1:]))
594 self.failUnless(N.any(ds.samples != ds_.samples))
595 self.failUnless(N.any(ds.labels != ds_.labels))
596 self.failUnless(N.all(ds.chunks == ds_.chunks))
597
598 ds_.chunks = N.hstack(([1234], ds_.chunks[1:]))
599 self.failUnless(N.any(ds.samples != ds_.samples))
600 self.failUnless(N.any(ds.labels != ds_.labels))
601 self.failUnless(N.any(ds.chunks != ds_.chunks))
602
603 self.failUnless(N.any(ds.uniquelabels != ds_.uniquelabels))
604 self.failUnless(N.any(ds.uniquechunks != ds_.uniquechunks))
605
606
if __name__ == '__main__':
    # delegate execution to the project-wide test runner module
    import runner