# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
9 """Dataset container"""
10
11 __docformat__ = 'restructuredtext'
12
13 import operator
14 import random
15 import mvpa.support.copy as copy
16 import numpy as N
17
18 from sets import Set
19
20
21
22
23
24
25 from mvpa.misc.exceptions import DatasetError
26 from mvpa.misc.support import idhash as idhash_
27 from mvpa.base.dochelpers import enhancedDocString, table2string
28
29 if __debug__:
30 from mvpa.base import debug, warning
31
33 """Helper function to validate that seq contains unique sorted values
34 """
35 if operator.isSequenceType(seq):
36 seq_unique = N.unique(seq)
37 if len(seq) != len(seq_unique):
38 warning("%s() operates only with indexes for %s without"
39 " repetitions. Repetitions were removed."
40 % (fname, item))
41 if N.any(N.sort(seq) != seq_unique):
42 warning("%s() does not guarantee the original order"
43 " of selected %ss. Use selectSamples() and "
44 " selectFeatures(sort=False) instead" % (fname, item))
45
46
47
49 """*The* Dataset.
50
51 This class provides a container to store all necessary data to
52 perform MVPA analyses. These are the data samples, as well as the
53 labels associated with the samples. Additionally, samples can be
54 grouped into chunks.
55
56 :Groups:
57 - `Creators`: `__init__`, `selectFeatures`, `selectSamples`,
58 `applyMapper`
59 - `Mutators`: `permuteLabels`
60
61 Important: labels assumed to be immutable, i.e. noone should modify
62 them externally by accessing indexed items, ie something like
63 ``dataset.labels[1] += "_bad"`` should not be used. If a label has
64 to be modified, full copy of labels should be obtained, operated on,
65 and assigned back to the dataset, otherwise dataset.uniquelabels
66 would not work. The same applies to any other attribute which has
67 corresponding unique* access property.
68
69 """
70
71
72
73
74
75
76
77
78
79
80
81
82
    _uniqueattributes = []
    """Unique attributes associated with the data"""

    _registeredattributes = []
    """Registered attributes (stored in _data)"""

    _requiredattributes = ['samples', 'labels']
    """Attributes which have to be provided to __init__, or otherwise
    no default values would be assumed and construction of the
    instance would fail"""

    def __init__(self,
                 # copy constructor mode
                 data=None,
                 dsattr=None,
                 # automatic dtype conversion
                 dtype=None,
                 # new dataset attributes
                 samples=None,
                 labels=None,
                 labels_map=None,
                 chunks=None,
                 origids=None,
                 # flags
                 check_data=True,
                 copy_samples=False,
                 copy_data=True,
                 copy_dsattr=True):
        """Initialize dataset instance

        There are basically two different ways to create a dataset:

        1. Create a new dataset from samples and sample attributes.  In
           this mode a two-dimensional `ndarray` has to be passed to the
           `samples` keyword argument and the corresponding samples
           attributes are provided via the `labels` and `chunks`
           arguments.

        2. Copy constructor mode
           The second way is used internally to perform quick copying
           of datasets, e.g. when performing feature selection. In this
           mode the two dictionaries (`data` and `dsattr`) are
           required. For performance reasons this mode bypasses most of
           the sanity checks performed by the previous mode, as for
           internal operations data integrity is assumed.


        :Parameters:
          data : dict
            Dictionary with an arbitrary number of entries. The value for
            each key in the dict has to be an ndarray with the
            same length as the number of rows in the samples array.
            A special entry in this dictionary is 'samples', a 2d array
            (samples x features). A shallow copy is stored in the object.
          dsattr : dict
            Dictionary of dataset attributes. An arbitrary number of
            arbitrarily named and typed objects can be stored here. A
            shallow copy of the dictionary is stored in the object.
          dtype : type | None
            If None -- do not change data type if samples
            is an ndarray. Otherwise convert samples to dtype.


        :Keywords:
          samples : ndarray
            2d array (samples x features)
          labels
            An array or scalar value defining labels for each sample
          labels_map : None or bool or dict
            Map from labels into literal names. If None or True,
            the mapping is computed from the labels, which must then be
            literal. If False, no mapping is computed. If a dict is
            given, the mapping is verified and applied, i.e. labels get
            remapped. The dict must map literal -> number.
          chunks
            An array or scalar value defining chunks for each sample

        Each of the keyword arguments overwrites what is/might be
        already in the `data` container.

        """

        # bootstrap with empty containers if nothing was provided
        if data is None:
            data = {}
        if dsattr is None:
            dsattr = {}

        # initialize containers
        if copy_data:
            # deep copy the data: samples are only copied if explicitly
            # requested, everything else is copied unconditionally
            lcl_data = data.copy()
            for k, v in data.iteritems():
                # skip copying samples if not requested
                if k == 'samples' and not copy_samples:
                    continue
                lcl_data[k] = v.copy()
        else:
            # shallow copy: a new dict, but the values are shared with
            # the source dataset
            lcl_data = data.copy()

        if copy_dsattr and len(dsattr)>0:
            # deep copy the dataset attributes
            if __debug__:
                debug('DS', "Deep copying dsattr %s" % `dsattr`)
            lcl_dsattr = copy.deepcopy(dsattr)
        else:
            # shallow copy suffices
            lcl_dsattr = copy.copy(dsattr)

        # bind to the instance
        self._data = lcl_data
        """What makes a dataset."""

        self._dsattr = lcl_dsattr
        """Dataset attributes."""

        # store samples (and possibly transform/reshape/retype them)
        if samples is not None:
            if __debug__:
                if lcl_data.has_key('samples'):
                    debug('DS',
                          "`Data` dict has `samples` (%s) but there is also" \
                          " __init__ parameter `samples` which overrides the" \
                          " one stored in `data`" % (`lcl_data['samples'].shape`))
            lcl_data['samples'] = self._shapeSamples(samples, dtype,
                                                     copy_samples)

        # labels
        if labels is not None:
            if __debug__:
                if lcl_data.has_key('labels'):
                    debug('DS',
                          "`Data` dict has `labels` (%s) but there is also" \
                          " __init__ parameter `labels` which overrides the" \
                          " one stored in `data`" % (`lcl_data['labels']`))
            if lcl_data.has_key('samples'):
                lcl_data['labels'] = \
                    self._expandSampleAttribute(labels, 'labels')

        # check if we got all required attributes
        for attr in self._requiredattributes:
            if not lcl_data.has_key(attr):
                raise DatasetError, \
                      "Attribute %s is required to initialize dataset" % \
                      attr

        nsamples = self.nsamples

        # chunks
        if chunks is not None:
            lcl_data['chunks'] = \
                self._expandSampleAttribute(chunks, 'chunks')
        elif not lcl_data.has_key('chunks'):
            # if no chunk information is given assume that every sample
            # is its own chunk
            lcl_data['chunks'] = N.arange(nsamples)

        # origids
        if origids is not None:
            # simply assign if provided
            lcl_data['origids'] = origids
        elif not lcl_data.has_key('origids'):
            # otherwise construct unique ones
            lcl_data['origids'] = N.arange(len(lcl_data['labels']))
        else:
            # origids came in via `data` (copy constructor mode): keep
            # them as they are, e.g. so they survive selectSamples()
            pass

        # initialize registered attributes which were not provided
        for attr in self._registeredattributes:
            if not lcl_data.has_key(attr):
                if __debug__:
                    debug("DS", "Initializing attribute %s" % attr)
                lcl_data[attr] = N.zeros(nsamples)

        # labels_map
        labels_ = N.asarray(lcl_data['labels'])
        labels_map_known = lcl_dsattr.has_key('labels_map')
        if labels_map is True:
            # need to compute labels_map
            if labels_.dtype.char == 'S' or not labels_map_known:
                # create mapping from sorted unique literal labels
                ulabels = list(Set(labels_))
                ulabels.sort()
                labels_map = dict([ (x[1], x[0]) for x in enumerate(ulabels) ])
                if __debug__:
                    debug('DS', 'Mapping for the labels computed to be %s'
                          % labels_map)
            else:
                if __debug__:
                    debug('DS', 'Mapping of labels was requested but labels '
                          'are not strings. Skipped')
                labels_map = None
        elif labels_map is False:
            labels_map = None

        if isinstance(labels_map, dict):
            if labels_map_known:
                if __debug__:
                    debug('DS',
                          "`dsattr` dict has `labels_map` (%s) but there is also" \
                          " __init__ parameter `labels_map` (%s) which overrides" \
                          " the one stored in `dsattr`"
                          % (lcl_dsattr['labels_map'], labels_map))

            lcl_dsattr['labels_map'] = labels_map
            # map labels if needed (if strings or explicitly requested)
            if labels_.dtype.char == 'S' or not labels_map_known:
                if __debug__:
                    debug('DS_', "Remapping labels using mapping %s" % labels_map)
                # need to remap
                try:
                    lcl_data['labels'] = N.array(
                        [labels_map[x] for x in lcl_data['labels']])
                except KeyError, e:
                    raise ValueError, "Provided labels_map %s is insufficient " \
                          "to map all the labels. Mapping for label %s is " \
                          "missing" % (labels_map, e)

        elif not lcl_dsattr.has_key('labels_map'):
            lcl_dsattr['labels_map'] = labels_map
        elif __debug__:
            debug('DS_', 'Not overriding labels_map in dsattr since it has one')

        if check_data:
            self._checkData()

        # Unique members are computed lazily; reset them here so that
        # attribute access does not depend on construction order
        if labels is not None or chunks is not None:
            # for a speed up do not go through all unique attributes
            # when there is no need
            lcl_dsattr['__uniquereseted'] = False
            self._resetallunique(force=True)

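    # Illustrative sketch of both construction modes (toy values):
    #
    #   >>> samples = N.random.randn(4, 2)
    #   >>> ds = Dataset(samples=samples, labels=[1, 1, 2, 2],
    #   ...              chunks=[0, 0, 1, 1])            # mode 1: from arrays
    #   >>> ds2 = Dataset(data=ds._data, dsattr=ds._dsattr,
    #   ...               check_data=False)              # mode 2: copy constructor
    #   >>> ds2.nsamples == ds.nsamples
    #   True
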
    __doc__ = enhancedDocString('Dataset', locals())

    @property
    def idhash(self):
        """To verify if dataset is in the same state as when something else
        was done

        E.g. if a classifier was trained on the same dataset as the one in
        question
        """
        _data = self._data
        res = idhash_(_data)

        # we cannot count on the order the values would be returned from
        # the dict, so use a sorted list of keys
        keys = _data.keys()
        keys.sort()
        for k in keys:
            res += idhash_(_data[k])
        return res

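    # Intended use, as an illustrative sketch: snapshot the hash, mutate the
    # dataset, compare -- e.g. a classifier can detect it was trained on a
    # different state of the data.
    #
    #   >>> h0 = ds.idhash
    #   >>> ds.permuteLabels(True)      # labels get reassigned
    #   >>> ds.idhash != h0             # the hash reflects the change
    #   True
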

    def _resetallunique(self, force=False):
        """Set to None all unique* attributes of corresponding dictionary
        """
        _dsattr = self._dsattr

        if not force and _dsattr['__uniquereseted']:
            return

        _uniqueattributes = self._uniqueattributes

        if __debug__ and "DS_" in debug.active:
            debug("DS_", "Resetting all attributes %s for dataset %s"
                  % (_uniqueattributes,
                     self.summary(uniq=False, idhash=False,
                                  stats=False, lstats=False)))

        # reset so that they get recomputed lazily on next access
        for k in _uniqueattributes:
            _dsattr[k] = None
        _dsattr['__uniquereseted'] = True


    def _getuniqueattr(self, attrib, dict_):
        """Provide common facility to return unique attributes

        XXX `dict_` can be simply replaced now with self._dsattr
        """
        # local bindings
        _dsattr = self._dsattr

        if not _dsattr.has_key(attrib) or _dsattr[attrib] is None:
            if __debug__ and 'DS_' in debug.active:
                debug("DS_", "Recomputing unique set for attrib %s within %s" %
                      (attrib, self.summary(uniq=False,
                                            stats=False, lstats=False)))
            # recompute: strip the leading 'unique' to get the attribute name
            _dsattr[attrib] = N.unique( N.asanyarray(dict_[attrib[6:]]) )
            assert(_dsattr[attrib] is not None)
            _dsattr['__uniquereseted'] = False

        return _dsattr[attrib]


    def _setdataattr(self, attrib, value):
        """Provide common facility to set attributes
        """
        if len(value) != self.nsamples:
            raise ValueError, \
                  "Provided %s have %d entries while there are %d samples" % \
                  (attrib, len(value), self.nsamples)
        self._data[attrib] = N.asarray(value)
        uniqueattr = "unique" + attrib

        # invalidate the cached unique values, if present
        _dsattr = self._dsattr
        if _dsattr.has_key(uniqueattr):
            _dsattr[uniqueattr] = None


    def _getNSamplesPerAttr(self, attrib='labels'):
        """Returns the number of samples per unique label.
        """
        # local bindings
        _data = self._data

        # XXX hardcoded dict_=self._data -- might be in self._dsattr
        uniqueattr = self._getuniqueattr(attrib="unique" + attrib,
                                         dict_=_data)

        # use dictionary to cope with arbitrary labels
        result = dict(zip(uniqueattr, [ 0 ] * len(uniqueattr)))
        for l in _data[attrib]:
            result[l] += 1

        return result

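    # Illustrative sketch (toy values): the `samplesperlabel` property
    # registered at the bottom of this module resolves to this helper.
    #
    #   >>> ds = Dataset(samples=N.zeros((4, 2)), labels=[1, 1, 2, 2])
    #   >>> ds.samplesperlabel
    #   {1: 2, 2: 2}
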

    def _getSampleIdsByAttr(self, values, attrib="labels",
                            sort=True):
        """Return indices of samples given a list of attribute values
        """
        if not operator.isSequenceType(values) \
               or isinstance(values, basestring):
            values = [ values ]

        # TODO: compare to plain for loop through the labels
        #       on a real data example
        sel = N.array([], dtype=N.int16)
        _data = self._data
        for value in values:
            sel = N.concatenate((
                sel, N.where(_data[attrib]==value)[0]))

        if sort:
            # place samples in the right order
            sel.sort()

        return sel

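    # Illustrative sketch (toy values): the registered `idsbylabels` /
    # `idsbychunks` methods delegate to this helper.
    #
    #   >>> ds = Dataset(samples=N.zeros((4, 2)), labels=[1, 2, 1, 2])
    #   >>> ds.idsbylabels([2])
    #   array([1, 3])
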
    def idsonboundaries(self, prior=0, post=0,
                        attributes_to_track=['labels', 'chunks'],
                        affected_labels=None,
                        revert=False):
        """Find samples which are on the boundaries of the blocks

        Such samples might need to be removed. By default (with
        prior=0, post=0) ids of the first samples in a 'block' are
        reported

        :Parameters:
          prior : int
            how many samples prior to the transition sample to include
          post : int
            how many samples post the transition sample to include
          attributes_to_track : list of basestring
            which attributes to track to decide on the boundary condition
          affected_labels : list of basestring
            for which labels to perform the selection. If None -- for all
          revert : bool
            whether to revert the meaning and return ids of samples which
            are found to not be boundary samples
        """
        # local bindings
        _data = self._data
        labels = self.labels
        nsamples = self.nsamples

        lastseen = [None for attr in attributes_to_track]
        transitions = []

        for i in xrange(nsamples):
            current = [_data[attr][i] for attr in attributes_to_track]
            if lastseen != current:
                # transition point
                new_transitions = range(max(0, i-prior),
                                        min(nsamples-1, i+post)+1)
                if affected_labels is not None:
                    new_transitions = filter(lambda i: labels[i] in affected_labels,
                                             new_transitions)
                transitions += new_transitions
                lastseen = current

        transitions = Set(transitions)
        if revert:
            transitions = Set(range(nsamples)).difference(transitions)

        # postprocess
        transitions = N.array(list(transitions))
        transitions.sort()
        return list(transitions)
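
    # Illustrative sketch (toy values): with two chunks of three samples
    # each and constant labels, the first sample of every chunk is reported
    # as a boundary sample.
    #
    #   >>> ds = Dataset(samples=N.zeros((6, 1)), labels=1,
    #   ...              chunks=[0, 0, 0, 1, 1, 1])
    #   >>> ds.idsonboundaries()
    #   [0, 3]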


    def _shapeSamples(self, samples, dtype, copy):
        """Adapt different kinds of samples

        Handle all possible input value for 'samples' and transform
        them into a 2d (samples x feature) representation.
        """
        # put samples array into correct shape:
        # 1d arrays or simple sequences are assumed to be a single pattern
        if (not isinstance(samples, N.ndarray)):
            # it is safe to provide dtype which defaults to None,
            # when N would choose appropriate dtype automagically
            samples = N.array(samples, ndmin=2, dtype=dtype, copy=copy)
        else:
            if samples.ndim < 2 \
                   or (dtype is not None and dtype != samples.dtype):
                if dtype is None:
                    dtype = samples.dtype
                samples = N.array(samples, ndmin=2, dtype=dtype, copy=copy)
            elif copy:
                samples = samples.copy()

        # only samples x features matrices are supported
        if len(samples.shape) > 2:
            raise DatasetError, "Only (samples x features) -> 2d sample " \
                  "are supported (got %s shape of samples)." \
                  % (`samples.shape`) \
                  + " Consider MappedDataset if applicable."

        return samples


    def _checkData(self):
        """Checks `_data` members to have the same # of samples.
        """
        # local bindings
        nsamples = self.nsamples
        _data = self._data

        for k, v in _data.iteritems():
            if not len(v) == nsamples:
                raise DatasetError, \
                      "Length of sample attribute '%s' [%i] does not " \
                      "match the number of samples in the dataset [%i]." \
                      % (k, len(v), nsamples)

        # check for unique origids: sorted unique values have to match
        # the sorted ids themselves
        uniques = N.unique(_data['origids'])
        uniques.sort()
        # need to copy to prevent sorting the original array
        sorted_ids = _data['origids'].copy()
        sorted_ids.sort()

        if not (uniques == sorted_ids).all():
            raise DatasetError, "Samples IDs are not unique."

594 """If a sample attribute is given as a scalar expand/repeat it to a
595 length matching the number of samples in the dataset.
596 """
597 try:
598
599
600 if isinstance(attr, basestring):
601 raise TypeError
602 if len(attr) != self.nsamples:
603 raise DatasetError, \
604 "Length of sample attribute '%s' [%d]" \
605 % (attr_name, len(attr)) \
606 + " has to match the number of samples" \
607 + " [%d]." % self.nsamples
608
609 return N.array(attr)
610
611 except TypeError:
612
613
614 return N.repeat(attr, self.nsamples)
615
616
    @classmethod
    def _registerAttribute(cls, key, dictname="_data", abbr=None,
                           hasunique=False):
        """Register an attribute for any Dataset class.

        Creates property assigning getters/setters depending on the
        availability of corresponding _get, _set functions.
        """
        classdict = cls.__dict__
        if not classdict.has_key(key):
            if __debug__:
                debug("DS", "Registering new attribute %s" % key)
            # define get function and use corresponding
            # _get<ATTR> if such is defined
            getter = '_get%s' % key
            if classdict.has_key(getter):
                getter = '%s.%s' % (cls.__name__, getter)
            else:
                getter = "lambda x: x.%s['%s']" % (dictname, key)

            # define set function and use corresponding
            # _set<ATTR> if such is defined
            setter = '_set%s' % key
            if classdict.has_key(setter):
                setter = '%s.%s' % (cls.__name__, setter)
            elif dictname=="_data":
                setter = "lambda self,x: self._setdataattr" + \
                         "(attrib='%s', value=x)" % (key)
            else:
                setter = None

            if __debug__:
                debug("DS", "Registering new property %s.%s" %
                      (cls.__name__, key))
            exec "%s.%s = property(fget=%s, fset=%s)" % \
                 (cls.__name__, key, getter, setter)

            if abbr is not None:
                exec "%s.%s = property(fget=%s, fset=%s)" % \
                     (cls.__name__, abbr, getter, setter)

            if hasunique:
                uniquekey = "unique%s" % key
                getter = '_get%s' % uniquekey
                if classdict.has_key(getter):
                    getter = '%s.%s' % (cls.__name__, getter)
                else:
                    getter = "lambda x: x._getuniqueattr" + \
                             "(attrib='%s', dict_=x.%s)" % (uniquekey, dictname)

                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, uniquekey))

                exec "%s.%s = property(fget=%s)" % \
                     (cls.__name__, uniquekey, getter)
                if abbr is not None:
                    exec "%s.U%s = property(fget=%s)" % \
                         (cls.__name__, abbr, getter)

                # create samplesper<ATTR> property (strip the trailing 's')
                sampleskey = "samplesper%s" % key[:-1]
                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, sampleskey))

                exec "%s.%s = property(fget=%s)" % \
                     (cls.__name__, sampleskey,
                      "lambda x: x._getNSamplesPerAttr(attrib='%s')" % key)

                cls._uniqueattributes.append(uniquekey)

                # create idsby<ATTR> method
                sampleskey = "idsby%s" % key
                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, sampleskey))

                exec "%s.%s = %s" % (cls.__name__, sampleskey,
                                     "lambda self, x: " +
                                     "self._getSampleIdsByAttr(x,attrib='%s')" % key)

                cls._uniqueattributes.append(uniquekey)

            cls._registeredattributes.append(key)
        elif __debug__:
            warning('Trying to reregister attribute `%s`. For now ' % key +
                    'such capability is not present')
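
    # For illustration: the registrations at the bottom of this module, e.g.
    #
    #   Dataset._registerAttribute("labels", "_data", abbr='L', hasunique=True)
    #
    # generate the `labels` property (alias `L`) backed by
    # self._data['labels'], plus `uniquelabels` (alias `UL`),
    # `samplesperlabel`, and the `idsbylabels` method -- all dispatching to
    # the generic helpers defined above.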


    def __str__(self):
        """String summary over the object
        """
        try:
            s = self.summary(uniq=True,
                             idhash=__debug__ and ('DS_ID' in debug.active),
                             stats=__debug__ and ('DS_STATS' in debug.active),
                             lstats=__debug__ and ('DS_STATS' in debug.active))
        except (AttributeError, KeyError), e:
            # a dataset might not be fully initialized when str() is
            # requested (e.g. from a debugger) -- fall back to the error text
            s = str(e)
        return s


    def __repr__(self):
        return "<%s>" % str(self)


    def summary(self, uniq=True, stats=True, idhash=False, lstats=True,
                maxc=30, maxl=20):
        """String summary over the object

        :Parameters:
          uniq : bool
            Include summary over data attributes which have unique values
          idhash : bool
            Include idhash value for dataset and samples
          stats : bool
            Include some basic statistics (mean, std, var) over dataset samples
          lstats : bool
            Include statistics on chunks/labels
          maxc : int
            Maximal number of chunks when providing details on labels/chunks
          maxl : int
            Maximal number of labels when providing details on labels/chunks
        """

        # local bindings
        samples = self.samples
        _data = self._data
        _dsattr = self._dsattr

        if idhash:
            idhash_ds = "{%s}" % self.idhash
            idhash_samples = "{%s}" % idhash_(samples)
        else:
            idhash_ds = ""
            idhash_samples = ""

        s = """Dataset %s/ %s %d%s x %d""" % \
            (idhash_ds, samples.dtype,
             self.nsamples, idhash_samples, self.nfeatures)

        ssep = (' ', '\n')[lstats]
        if uniq:
            s += "%suniq:" % ssep
            for uattr in _dsattr.keys():
                if not uattr.startswith("unique"):
                    continue
                attr = uattr[6:]
                try:
                    value = self._getuniqueattr(attrib=uattr,
                                                dict_=_data)
                    s += " %d %s" % (len(value), attr)
                except:
                    pass

        if isinstance(self.labels_map, dict):
            s += ' labels_mapped'

        if stats:
            # TODO -- avoid this computation if stats are disabled
            s += "%sstats: mean=%g std=%g var=%g min=%g max=%g\n" % \
                 (ssep, N.mean(samples), N.std(samples),
                  N.var(samples), N.min(samples), N.max(samples))

        if lstats:
            s += self.summary_labels(maxc=maxc, maxl=maxl)

        return s


    def summary_labels(self, maxc=30, maxl=20):
        """Provide summary statistics over the labels and chunks

        :Parameters:
          maxc : int
            Maximal number of chunks when details are provided
          maxl : int
            Maximal number of labels when details are provided
        """
        # import here to avoid a hard dependency of Dataset on miscfx
        from mvpa.datasets.miscfx import getSamplesPerChunkLabel
        spcl = getSamplesPerChunkLabel(self)

        ul = self.uniquelabels.tolist()
        uc = self.uniquechunks.tolist()
        s = ""
        if len(ul) < maxl and len(uc) < maxc:
            s += "\nCounts of labels in each chunk:"
            # table of chunks x labels
            table = [[' chunks\labels'] + ul]
            table += [[''] + ['---'] * len(ul)]
            for c, counts in zip(uc, spcl):
                table.append([ str(c) ] + counts.tolist())
            s += '\n' + table2string(table)
        else:
            s += "No details due to large number of labels or chunks. " \
                 "Increase maxc and maxl if desired"

        labels_map = self.labels_map
        if isinstance(labels_map, dict):
            s += "\nOriginal labels were mapped using the following mapping:"
            s += '\n\t'+'\n\t'.join([':\t'.join(map(str, x))
                                     for x in labels_map.items()]) + '\n'

        def cl_stats(axis, u, name1, name2):
            """Compute statistics per label
            """
            stats = {'min': N.min(spcl, axis=axis),
                     'max': N.max(spcl, axis=axis),
                     'mean': N.mean(spcl, axis=axis),
                     'std': N.std(spcl, axis=axis),
                     '#%ss' % name2: N.sum(spcl>0, axis=axis)}
            entries = [' ' + name1, 'mean', 'std', 'min', 'max', '#%ss' % name2]
            table = [ entries ]
            for i, l in enumerate(u):
                d = {' ' + name1 : l}
                d.update(dict([ (k, stats[k][i]) for k in stats.keys()]))
                table.append( [ ('%.3g', '%s')[isinstance(d[e], basestring)]
                                % d[e] for e in entries] )
            return '\nSummary per %s across %ss\n' % (name1, name2) \
                   + table2string(table)

        if len(ul) < maxl:
            s += cl_stats(0, ul, 'label', 'chunk')
        if len(uc) < maxc:
            s += cl_stats(1, uc, 'chunk', 'label')
        return s


    def __iadd__(self, other):
        """Merge the samples of one Dataset object to another (in-place).

        No dataset attributes, besides labels_map, will be merged!
        Additionally, a new set of unique `origids` will be generated.
        """
        # local bindings
        _data = self._data
        other_data = other._data

        if not self.nfeatures == other.nfeatures:
            raise DatasetError, "Cannot add Dataset, because the number of " \
                                "features does not match."

        # take care about labels_map and labels
        slm = self.labels_map
        olm = other.labels_map
        if N.logical_xor(slm is None, olm is None):
            raise ValueError, "Cannot add datasets where only one of them " \
                  "has labels map assigned. If needed -- implement it"

        # concatenate all sample attributes
        for k, v in _data.iteritems():
            if k == 'origids':
                # special case origids: simply regenerate unique ones.
                # checking the concatenation for uniqueness instead would
                # be costly performance-wise
                _data[k] = N.arange(len(v) + len(other_data[k]))

            elif k == 'labels' and slm is not None:
                # special care about labels if mapping is in effect: append
                # the 2nd map to the 1st one and relabel the 2nd dataset
                nlm = slm.copy()
                # figure out the next free numerical label
                nextid = N.sort(nlm.values())[-1] + 1
                olabels = other.labels
                olabels_remap = {}
                for ol, olnum in olm.iteritems():
                    if not nlm.has_key(ol):
                        # check if we can preserve old numerical label;
                        # if not -- assign a new one not yet present in
                        # either dataset
                        if olnum in nlm.values():
                            nextid = N.sort(nlm.values() + olm.values())[-1] + 1
                        else:
                            nextid = olnum
                        olabels_remap[olnum] = nextid
                        nlm[ol] = nextid
                        nextid += 1
                    else:
                        olabels_remap[olnum] = nlm[ol]
                olabels = [olabels_remap[x] for x in olabels]
                # finally compose new labels
                _data['labels'] = N.concatenate((v, olabels), axis=0)
                # and reassign the merged mapping
                self._dsattr['labels_map'] = nlm

                if __debug__:
                    # check if we are not dealing with a colliding mapping,
                    # since that is problematic and might lead to various
                    # complications
                    if (len(Set(slm.keys())) != len(Set(slm.values()))) or \
                       (len(Set(olm.keys())) != len(Set(olm.values()))):
                        warning("Adding datasets where multiple labels "
                                "mapped to the same ID is not recommended. "
                                "Please check the outcome. Original mappings "
                                "were %s and %s. Resultant is %s"
                                % (slm, olm, nlm))

            else:
                _data[k] = N.concatenate((v, other_data[k]), axis=0)

        # could be more sophisticated, but for now just reset -- it is safer
        self._resetallunique()

        return self

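    # Illustrative sketch (toy values; both datasets must have the same
    # number of features):
    #
    #   >>> ds1 = Dataset(samples=N.zeros((2, 3)), labels=[1, 2])
    #   >>> ds1 += Dataset(samples=N.ones((2, 3)), labels=[1, 2])
    #   >>> ds1.nsamples
    #   4
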

    def __add__(self, other):
        """Merge the samples of two Dataset objects.

        All data of both datasets is copied, concatenated and a new Dataset is
        returned.

        NOTE: This can be a costly operation (both memory and time). If
        performance is important consider the '+=' operator.
        """
        # create a new object of the same type it is now and NOT only Dataset
        out = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        out.__init__(data=self._data,
                     dsattr=self._dsattr,
                     copy_samples=True,
                     copy_data=True,
                     copy_dsattr=True)

        out += other

        return out


    def copy(self):
        """Create a copy (clone) of the dataset, by fully copying current one
        """
        # create a new object of the same type it is now and NOT only Dataset
        out = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        out.__init__(data=self._data,
                     dsattr=self._dsattr,
                     copy_samples=True,
                     copy_data=True,
                     copy_dsattr=True)

        return out


    def selectFeatures(self, ids=None, sort=True, groups=None):
        """Select a number of features from the current set.

        :Parameters:
          ids
            iterable container to select ids
          sort : bool
            whether to sort the ids. Order matters and `selectFeatures`
            assumes incremental order. If not sorted, non-optimized code
            would verify the order and sort

        Returns a new Dataset object with a view of the original
        samples array (no copying is performed).

        WARNING: The order of ids determines the order of features in
        the returned dataset. This might be useful sometimes, but can
        also cause major headaches! The order is verified when
        running in non-optimized code (if __debug__)
        """
        if ids is None and groups is None:
            raise ValueError, "No feature selection specified."

        # start with an empty list if only groups were specified
        if ids is None:
            ids = []

        if groups is not None:
            if not self._dsattr.has_key('featuregroups'):
                raise RuntimeError, \
                      "Dataset has no feature grouping information."

            for g in groups:
                ids += (self._dsattr['featuregroups'] == g).nonzero()[0].tolist()

        # sort, or verify sortedness in non-optimized code
        if sort:
            ids.sort()
        elif __debug__ and 'CHECK_DS_SORTED' in debug.active:
            from mvpa.misc.support import isSorted
            if not isSorted(ids):
                warning("IDs for selectFeatures must be provided " +
                        "in sorted order, otherwise major headache might occur")

        # shallow-copy all stuff from the current data dict
        new_data = self._data.copy()

        # assign the selected features -- data is still shared with
        # the current dataset
        new_data['samples'] = self._data['samples'][:, ids]

        # apply the selection to feature groups as well
        if self._dsattr.has_key('featuregroups'):
            new_dsattr = self._dsattr.copy()
            new_dsattr['featuregroups'] = self._dsattr['featuregroups'][ids]
        else:
            new_dsattr = self._dsattr

        # create a new object of the same type it is now and NOT only Dataset
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        dataset.__init__(data=new_data,
                         dsattr=new_dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False
                         )

        return dataset

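    # Illustrative sketch (toy values): keep features 0 and 2; the samples
    # array is viewed, not copied.
    #
    #   >>> ds = Dataset(samples=N.arange(8).reshape(2, 4), labels=[1, 2])
    #   >>> ds.selectFeatures([0, 2]).samples
    #   array([[0, 2],
    #          [4, 6]])
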

    def applyMapper(self, featuresmapper=None, samplesmapper=None,
                    train=True):
        """Obtain new dataset by applying mappers over features and/or samples.

        While featuresmappers leave the sample attributes information
        unchanged, as the number of samples in the dataset is invariant,
        samplesmappers are also applied to the samples attributes themselves!

        Applying a featuresmapper will destroy any feature grouping information.

        :Parameters:
          featuresmapper : Mapper
            `Mapper` to somehow transform each sample's features
          samplesmapper : Mapper
            `Mapper` to transform each feature across samples
          train : bool
            Flag whether to train the mapper with this dataset before applying
            it.

        TODO: selectFeatures is pretty much
              applyMapper(featuresmapper=MaskMapper(...))
        """
        # shallow-copy the data dict; values get replaced below as needed
        new_data = self._data.copy()

        # apply mappers

        if samplesmapper:
            if __debug__:
                debug("DS", "Training samplesmapper %s" % `samplesmapper`)
            samplesmapper.train(self)

            if __debug__:
                debug("DS", "Applying samplesmapper %s" % `samplesmapper` +
                      " to samples of dataset `%s`" % `self`)

            # remove origids since they are not immune to remapping
            if new_data.has_key('origids'):
                del(new_data['origids'])

            # apply the mapper to all sample-wise attributes
            for k in new_data.keys():
                new_data[k] = samplesmapper.forward(self._data[k])

        # feature mapping might affect dataset attributes
        new_dsattr = self._dsattr

        if featuresmapper:
            if __debug__:
                debug("DS", "Training featuresmapper %s" % `featuresmapper`)
            featuresmapper.train(self)

            if __debug__:
                debug("DS", "Applying featuresmapper %s" % `featuresmapper` +
                      " to samples of dataset `%s`" % `self`)
            new_data['samples'] = featuresmapper.forward(self._data['samples'])

            # remove feature grouping -- who knows what the mapper did to
            # the features
            if self._dsattr.has_key('featuregroups'):
                new_dsattr = self._dsattr.copy()
                del(new_dsattr['featuregroups'])
            else:
                new_dsattr = self._dsattr

        # create a new object of the same type it is now and NOT only Dataset
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        dataset.__init__(data=new_data,
                         dsattr=new_dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False
                         )

        # sample attributes might have changed after applying samplesmapper
        if samplesmapper:
            dataset._resetallunique(force=True)

        return dataset


    def selectSamples(self, ids):
        """Choose a subset of samples defined by samples IDs.

        Returns a new dataset object containing the selected sample
        subset.

        TODO: yoh, we might need to sort the mask if the mask is a
        list of ids and is not ordered. Clarify with Michael what is
        our intent here!
        """
        # without having a sequence as index the masked sample array would
        # lose its 2d layout
        if not operator.isSequenceType( ids ):
            ids = [ids]
        # TODO: reconsider crafting a slice if it can be done, to avoid
        #       copying the data

        # mask all sample attributes with the selection
        data = {}
        for k, v in self._data.iteritems():
            data[k] = v[ids, ]

        # create a new object of the same type it is now and NOT only Dataset
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        dataset.__init__(data=data,
                         dsattr=self._dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False)

        dataset._resetallunique(force=True)
        return dataset

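    # Illustrative sketch (toy values): pick samples 0 and 2; all sample
    # attributes are subset alongside the samples themselves.
    #
    #   >>> ds = Dataset(samples=N.arange(6).reshape(3, 2), labels=[1, 2, 3])
    #   >>> ds.selectSamples([0, 2]).labels
    #   array([1, 3])
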


    def index(self, *args, **kwargs):
        """Universal indexer to obtain indexes of interesting samples/features.
        See .select() for more information

        :Return: tuple of (samples indexes, features indexes). Each
          item could also be None, if no selection on samples or
          features was requested (to discriminate between no selected
          items and no selections).
        """
        s_indx = []                     # selections for samples
        f_indx = []                     # selections for features
        return_dataset = kwargs.pop('return_dataset', False)
        largs = len(args)

        args = list(args)               # so we could override
        # figure out the number of positional selections in args
        largs_nonstring = 0
        # need to go by index since we might override entries in place
        for i in xrange(largs):
            l = args[i]
            if isinstance(l, basestring):
                if l.lower() == 'all':
                    # override with a slice
                    args[i] = slice(None)
                else:
                    break
            largs_nonstring += 1

        if largs_nonstring >= 1:
            s_indx.append(args[0])
            if __debug__ and 'CHECK_DS_SELECT' in debug.active:
                _validate_indexes_uniq_sorted(args[0], 'select', 'samples')
            if largs_nonstring == 2:
                f_indx.append(args[1])
                if __debug__ and 'CHECK_DS_SELECT' in debug.active:
                    _validate_indexes_uniq_sorted(args[1], 'select', 'features')
            elif largs_nonstring > 2:
                raise ValueError, "Only two positional arguments are allowed" \
                      ". 1st for samples, 2nd for features"

        # the remaining positional arguments must come in pairs:
        # (attribute name, values)
        if (largs - largs_nonstring) % 2 != 0:
            raise ValueError, "Positional selections must come in pairs:" \
                  " e.g. ('labels', [1,2,3])"

        for i in xrange(largs_nonstring, largs, 2):
            k, v = args[i:i+2]
            kwargs[k] = v

        # process keyword selections
        data_ = self._data
        for k, v in kwargs.iteritems():
            if k == 'samples':
                s_indx.append(v)
            elif k == 'features':
                f_indx.append(v)
            elif data_.has_key(k):
                # so we have an attribute to select by
                if __debug__:
                    if not N.any([isinstance(v, cls) for cls in
                                  [list, tuple, slice, int]]):
                        raise ValueError, "Trying to specify selection for %s " \
                              "based on unsupported '%s'" % (k, v)
                s_indx.append(self._getSampleIdsByAttr(v, attrib=k, sort=False))
            else:
                raise ValueError, 'Keyword "%s" is not known, thus ' \
                      'select() failed' % k

        def combine_indexes(indx, nelements):
            """Helper function: intersect selections given in indx

            :Parameters:
              indxs : list of lists or slices
                selections of elements
              nelements : int
                number of elements total for deriving indexes from slices
            """
            indx_sel = None             # plain list of ids for selection
            for s in indx:
                if isinstance(s, slice) or \
                   isinstance(s, N.ndarray) and s.dtype==bool:
                    # XXX there might be a better way than reconstructing
                    # the full list of indexes; also we cannot .sort a slice
                    all_indexes = N.arange(nelements)
                    s = all_indexes[s]
                elif not operator.isSequenceType(s):
                    s = [ s ]

                if indx_sel is None:
                    indx_sel = Set(s)
                else:
                    # XXX semantics of selecting the same ids twice is
                    #     not defined -- intersection is used
                    indx_sel = indx_sel.intersection(s)

            # if we got a Set -- convert
            if isinstance(indx_sel, Set):
                indx_sel = list(indx_sel)

            # sort for the sake of sanity
            indx_sel.sort()

            return indx_sel

        # select samples
        if len(s_indx) == 1 and isinstance(s_indx[0], slice) \
               and s_indx[0] == slice(None):
            # so no actual selection was requested among samples
            s_indx = s_indx[0]
        else:
            # else we need to do selection for samples
            if len(s_indx) == 0:
                s_indx = None
            else:
                s_indx = combine_indexes(s_indx, self.nsamples)

        # select features
        if len(f_indx):
            f_indx = combine_indexes(f_indx, self.nfeatures)
        else:
            f_indx = None

        return s_indx, f_indx


    def select(self, *args, **kwargs):
        """Universal selector

        WARNING: if you need to select duplicate samples
        (e.g. samples=[5,5]), or if the order of selected samples or
        features matters and must not be sorted (e.g. samples=[3,2,1]),
        please use selectFeatures or selectSamples functions directly

        Examples:
          Mimic plain selectSamples::

            dataset.select([1,2,3])
            dataset[[1,2,3]]

          Mimic plain selectFeatures::

            dataset.select(slice(None), [1,2,3])
            dataset.select('all', [1,2,3])
            dataset[:, [1,2,3]]

          Mixed (select features and samples)::

            dataset.select([1,2,3], [1, 2])
            dataset[[1,2,3], [1, 2]]

          Select samples matching some attributes::

            dataset.select(labels=[1,2], chunks=[2,4])
            dataset.select('labels', [1,2], 'chunks', [2,4])
            dataset['labels', [1,2], 'chunks', [2,4]]

          Mixed -- out of first 100 samples, select only those with
          labels 1 or 2 and belonging to chunks 2 or 4, and select
          features 2 and 3::

            dataset.select(slice(0,100), [2,3], labels=[1,2], chunks=[2,4])
            dataset[:100, [2,3], 'labels', [1,2], 'chunks', [2,4]]

        """
        s_indx, f_indx = self.index(*args, **kwargs)

        # select samples
        if s_indx == slice(None):
            # no actual selection on samples -- keep the dataset as is
            if __debug__:
                debug('DS', 'in select() not selecting samples')
            ds = self
        else:
            # else we need to do selection for samples
            if __debug__:
                debug('DS', 'in select() selecting samples given selections'
                      + str(s_indx))
            ds = self.selectSamples(s_indx)

        # select features
        if f_indx is not None:
            if __debug__:
                debug('DS', 'in select() selecting features given selections'
                      + str(f_indx))
            ds = ds.selectFeatures(f_indx)

        return ds


    def where(self, *args, **kwargs):
        """Obtain indexes of interesting samples/features. See select() for
        more information

        XXX somewhat obsoletes idsby...
        """
        s_indx, f_indx = self.index(*args, **kwargs)
        if s_indx is not None and f_indx is not None:
            return s_indx, f_indx
        elif s_indx is not None:
            return s_indx
        else:
            return f_indx


    def __getitem__(self, *args):
        """Convenience dataset parts selection

        See select for more information
        """
        # for cases like ds[[0, 1], [1, 2]] Python passes a single tuple --
        # unify with the plain positional case
        if len(args) == 1 and isinstance(args[0], tuple):
            args = args[0]

        # turn ['attr':values] slices into ('attr', values) pairs of
        # positional arguments for select()
        args_, args = args, ()
        for a in args_:
            if isinstance(a, slice) and \
               isinstance(a.start, basestring):
                # with strided slices it is not a valid selection
                if a.stop is None or a.step is not None:
                    raise ValueError, \
                          "Selection must look like ['chunks':[2,3]]"
                args += (a.start, a.stop)
            else:
                args += (a,)
        return self.select(*args)

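    # Illustrative sketch: __getitem__ accepts the same selections as
    # select(), plus a slice syntax for attribute/value pairs.
    #
    #   >>> sub = ds[:2]                    # first two samples
    #   >>> sub = ds[[0, 1], [1, 2]]        # samples 0,1 and features 1,2
    #   >>> sub = ds['labels':[1, 2]]       # samples with labels 1 or 2
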

    def permuteLabels(self, status, perchunk=True, assure_permute=False):
        """Permute the labels.

        TODO: rename status into something closer in semantics.

        :Parameters:
          status : bool
            Calling this method with status set to True, the labels are
            permuted among all samples. If 'status' is False the
            original labels are restored.
          perchunk : bool
            If True permutation is limited to samples sharing the same
            chunk value. Therefore only the association of a certain
            sample with a label is permuted while keeping the absolute
            number of occurences of each label value within a certain
            chunk constant.
          assure_permute : bool
            If True, assures that labels are permuted, i.e. any one is
            different from the original one
        """
        # local bindings
        _data = self._data

        if len(self.uniquelabels)<2:
            raise RuntimeError, \
                  "Call to permuteLabels is bogus since there is an" \
                  " insufficient number of labels: %s" % self.uniquelabels

        if not status:
            # restore originals
            if _data.get('origlabels', None) is None:
                raise RuntimeError, 'Cannot restore labels. ' \
                                    'permuteLabels() has never been ' \
                                    'called with status == True.'
            self.labels = _data['origlabels']
            _data.pop('origlabels')
        else:
            # store orig labels, but only if not yet done, otherwise multiple
            # calls with status == True would destroy the original labels
            if not _data.has_key('origlabels') \
                   or _data['origlabels'] is None:
                # bind old labels to origlabels
                _data['origlabels'] = _data['labels']
                # copy labels, so the original ones stay intact
                _data['labels'] = copy.copy(_data['labels'])

            labels = _data['labels']
            # now scramble
            if perchunk:
                for o in self.uniquechunks:
                    labels[self.chunks == o] = \
                        N.random.permutation(labels[self.chunks == o])
            else:
                labels = N.random.permutation(labels)

            self.labels = labels

            if assure_permute:
                if not (_data['labels'] != _data['origlabels']).any():
                    if not (assure_permute is True):
                        if assure_permute == 1:
                            raise RuntimeError, \
                                  "Cannot assure permutation of labels %s for " \
                                  "some reason with chunks %s and while " \
                                  "perchunk=%s . Should not happen" % \
                                  (self.labels, self.chunks, perchunk)
                    else:
                        assure_permute = 11 # "large" number of retries
                    if __debug__:
                        debug("DS", "Recalling permute to assure different labels")
                    self.permuteLabels(status, perchunk=perchunk,
                                       assure_permute=assure_permute-1)

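    # Illustrative permutation-test sketch:
    #
    #   >>> ds.permuteLabels(True, perchunk=True)   # scramble within chunks
    #   >>> # ... train/test a classifier on the permuted dataset here ...
    #   >>> ds.permuteLabels(False)                 # restore original labels
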

    def getRandomSamples(self, nperlabel):
        """Select a random set of samples.

        If 'nperlabel' is an integer value, the specified number of samples is
        randomly chosen from the group of samples sharing a unique label
        value (total number of selected samples: nperlabel x len(uniquelabels)).

        If 'nperlabel' is a list, its length has to match the number of
        unique label values. In this case 'nperlabel' specifies the number of
        samples that shall be selected from the samples with the corresponding
        label.

        The method returns a Dataset object containing the selected
        samples.
        """
        # if an integer is given take this value for all classes
        if isinstance(nperlabel, int):
            nperlabel = [ nperlabel for i in self.uniquelabels ]

        sample = []
        # draw random samples for each unique label value
        labels = self.labels
        for i, r in enumerate(self.uniquelabels):
            # get the list of pattern ids for this label
            sample += random.sample( (labels == r).nonzero()[0],
                                     nperlabel[i] )

        return self.selectSamples( sample )

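    # Illustrative sketch (assuming `ds` has at least 2 samples per label):
    #
    #   >>> balanced = ds.getRandomSamples(2)   # 2 random samples per label
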

    def getNSamples( self ):
        """Currently available number of patterns.
        """
        return self._data['samples'].shape[0]

1533 """Number of features per pattern.
1534 """
1535 return self._data['samples'].shape[1]
1536
1537
1539 """Stored labels map (if any)
1540 """
1541 return self._dsattr.get('labels_map', None)
1542
1543
1545 """Set labels map.
1546
1547 Checks for the validity of the mapping -- values should cover
1548 all existing labels in the dataset
1549 """
1550 values = Set(lm.values())
1551 labels = Set(self.uniquelabels)
1552 if not values.issuperset(labels):
1553 raise ValueError, \
1554 "Provided mapping %s has some existing labels (out of %s) " \
1555 "missing from mapping" % (list(values), list(labels))
1556 self._dsattr['labels_map'] = lm
1557
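    # Illustrative sketch (toy values): literal labels get mapped to
    # numerical ones, while the mapping is kept around for reporting.
    #
    #   >>> ds = Dataset(samples=N.zeros((2, 2)), labels=['rest', 'task'],
    #   ...              labels_map=True)
    #   >>> ds.labels_map['task']
    #   1
    #   >>> ds.labels
    #   array([0, 1])
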

    def setSamplesDType(self, dtype):
        """Set the data type of the samples array.
        """
        # local bindings
        _data = self._data

        if _data['samples'].dtype != dtype:
            _data['samples'] = _data['samples'].astype(dtype)

1570 """Assign `definition` to featuregroups
1571
1572 XXX Feature-groups was not finished to be useful
1573 """
1574 if not len(definition) == self.nfeatures:
1575 raise ValueError, \
1576 "Length of feature group definition %i " \
1577 "does not match the number of features %i " \
1578 % (len(definition), self.nfeatures)
1579
1580 self._dsattr['featuregroups'] = N.array(definition)
1581
1582
1584 """Returns a boolean mask with all features in `ids` selected.
1585
1586 :Parameters:
1587 ids: list or 1d array
1588 To be selected features ids.
1589
1590 :Returns:
1591 ndarray: dtype='bool'
1592 All selected features are set to True; False otherwise.
1593 """
1594 fmask = N.repeat(False, self.nfeatures)
1595 fmask[ids] = True
1596
1597 return fmask
1598
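    # Illustrative round-trip sketch (toy values, 4 features):
    #
    #   >>> ds = Dataset(samples=N.zeros((2, 4)), labels=[1, 2])
    #   >>> mask = ds.convertFeatureIds2FeatureMask([1, 3])
    #   >>> mask
    #   array([False,  True, False,  True], dtype=bool)
    #   >>> ds.convertFeatureMask2FeatureIds(mask)
    #   array([1, 3])
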

    def convertFeatureMask2FeatureIds(self, mask):
        """Returns feature ids corresponding to non-zero elements in the mask.

        :Parameters:
          mask : 1d ndarray
            Feature mask.

        :Returns:
          ndarray: integer
            Ids of non-zero (non-False) mask elements.
        """
        return mask.nonzero()[0]

    @staticmethod
    def _checkCopyConstructorArgs(**kwargs):
        """Common sanity check for Dataset copy constructor calls."""
        # check if we have samples somewhere -- either directly or within
        # the `data` dict
        samples = None
        if kwargs.has_key('samples'):
            samples = kwargs['samples']
        if samples is None and kwargs.has_key('data') \
           and kwargs['data'].has_key('samples'):
            samples = kwargs['data']['samples']
        if samples is None:
            raise DatasetError, \
                  "`samples` must be provided to copy constructor call."

        if not len(samples.shape) == 2:
            raise DatasetError, \
                  "samples must be in 2D shape in copy constructor call."


    # class properties
    nsamples = property( fget=getNSamples )
    nfeatures = property( fget=getNFeatures )
    labels_map = property( fget=getLabelsMap, fset=setLabelsMap )
1637
1639 """Decorator to easily bind functions to a Dataset class
1640 """
1641 if __debug__:
1642 debug("DS_", "Binding function %s to Dataset class" % func.func_name)
1643
1644
1645 setattr(Dataset, func.func_name, func)
1646
1647
1648 return func
1649
1650
1651
# Register the core sample attributes -- this generates the properties
# (and their abbreviated aliases) described in _registerAttribute above
Dataset._registerAttribute("samples", "_data", abbr='S', hasunique=False)
Dataset._registerAttribute("labels", "_data", abbr='L', hasunique=True)
Dataset._registerAttribute("chunks", "_data", abbr='C', hasunique=True)
# samples ids -- unique by definition, thus no unique* helpers
Dataset._registerAttribute("origids", "_data", abbr='I', hasunique=False)

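
# Minimal self-contained usage sketch (an illustrative addition; assumes the
# mvpa package is importable and uses made-up toy values):
if __name__ == '__main__':
    ds = Dataset(samples=N.random.randn(6, 4),
                 labels=[1, 1, 2, 2, 3, 3],
                 chunks=[0, 1, 0, 1, 0, 1])
    print ds.summary()
    # attribute-based selection via the universal selector
    sub = ds.select('labels', [1, 2])
    print sub.nsamples, sub.nfeatures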