Package mvpa :: Package datasets :: Module base
[hide private]
[frames] | no frames]

Source Code for Module mvpa.datasets.base

   1  #emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*- 
   2  #ex: set sts=4 ts=4 sw=4 et: 
   3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
   4  # 
   5  #   See COPYING file distributed along with the PyMVPA package for the 
   6  #   copyright and license terms. 
   7  # 
   8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
   9  """Dataset container""" 
  10   
  11  __docformat__ = 'restructuredtext' 
  12   
  13  import operator 
  14  import random 
  15  import mvpa.support.copy as copy 
  16  import numpy as N 
  17   
  18  from sets import Set 
  19   
  20  # Sooner or later Dataset would become ClassWithCollections as well, but for 
  21  # now just an object -- thus commenting out tentative changes 
  22  # 
  23  #XXX from mvpa.misc.state import ClassWithCollections, SampleAttribute 
  24   
  25  from mvpa.misc.exceptions import DatasetError 
  26  from mvpa.misc.support import idhash as idhash_ 
  27  from mvpa.base.dochelpers import enhancedDocString, table2string 
  28   
  29  if __debug__: 
  30      from mvpa.base import debug, warning 
  31   
def _validate_indexes_uniq_sorted(seq, fname, item):
    """Helper function to validate that seq contains unique sorted values

    Emits a warning when `seq` contains repetitions, and another warning
    when the original order of `seq` cannot be preserved by the caller.

    :Parameters:
      seq
        sequence of indexes to validate
      fname : basestring
        name of the calling function -- used only in the warning text
      item : basestring
        name of the entity the indexes refer to (e.g. 'feature', 'sample')
    """
    # NOTE(review): operator.isSequenceType is Python2-only (removed in
    # Python 3) -- kept for consistency with the rest of this module
    if operator.isSequenceType(seq):
        seq_unique = N.unique(seq)
        has_repetitions = len(seq) != len(seq_unique)
        if has_repetitions:
            warning("%s() operates only with indexes for %s without"
                    " repetitions. Repetitions were removed."
                    % (fname, item))
        # FIX: with repetitions present, N.sort(seq) and seq_unique differ
        # in length and an elementwise `!=` is invalid under modern numpy
        # (old numpy evaluated the mismatched comparison to a scalar True,
        # i.e. the order warning fired as well) -- reproduce that behavior
        # explicitly instead of relying on the broken comparison
        if has_repetitions or N.any(N.sort(seq) != seq_unique):
            warning("%s() does not guarantee the original order"
                    " of selected %ss. Use selectSamples() and "
                    " selectFeatures(sort=False) instead" % (fname, item))
45 46 47 #XXX class Dataset(ClassWithCollections):
class Dataset(object):
    """*The* Dataset.

    This class provides a container to store all necessary data to
    perform MVPA analyses. These are the data samples, as well as the
    labels associated with the samples. Additionally, samples can be
    grouped into chunks.

    :Groups:
      - `Creators`: `__init__`, `selectFeatures`, `selectSamples`,
        `applyMapper`
      - `Mutators`: `permuteLabels`

    Important: labels assumed to be immutable, i.e. noone should modify
    them externally by accessing indexed items, ie something like
    ``dataset.labels[1] += "_bad"`` should not be used. If a label has
    to be modified, full copy of labels should be obtained, operated on,
    and assigned back to the dataset, otherwise dataset.uniquelabels
    would not work.  The same applies to any other attribute which has
    corresponding unique* access property.

    """
    # XXX Notes about migration to use Collections to store data and
    #     attributes for samples, features, and dataset itself:

    # changes:
    #   _data  -> s_attr collection (samples attributes)
    #   _dsattr -> ds_attr collection
    #              f_attr collection (features attributes)

    # static definition to track which unique attributes
    # have to be reset/recomputed whenever anything relevant
    # changes

    # NOTE(review): the two lists below are *class-level* mutable containers
    # appended to by _registerAttribute via `cls.…append(…)`; registration in
    # a subclass therefore mutates the list inherited from Dataset -- confirm
    # this sharing is intended.

    # unique{labels,chunks} become a part of dsattr
    _uniqueattributes = []
    """Unique attributes associated with the data"""

    _registeredattributes = []
    """Registered attributes (stored in _data)"""

    _requiredattributes = ['samples', 'labels']
    """Attributes which have to be provided to __init__, or otherwise
    no default values would be assumed and construction of the
    instance would fail"""

    #XXX _ATTRIBUTE_COLLECTIONS = [ 's_attr', 'f_attr', 'ds_attr' ]
    #XXX """Assure those 3 collections to be present in all datasets"""
    #XXX
    #XXX samples__ = SampleAttribute(doc="Samples data. 0th index is time", hasunique=False) # XXX
    #XXX labels__ = SampleAttribute(doc="Labels for the samples", hasunique=True)
    #XXX chunks__ = SampleAttribute(doc="Chunk identities for the samples", hasunique=True)
    #XXX # samples ids (already unique by definition)
    #XXX origids__ = SampleAttribute(doc="Chunk identities for the samples", hasunique=False)
    def __init__(self,
                 # for copy constructor
                 data=None,
                 dsattr=None,
                 # automatic dtype conversion
                 dtype=None,
                 # new instances
                 samples=None,
                 labels=None,
                 labels_map=None,
                 chunks=None,
                 origids=None,
                 # flags
                 check_data=True,
                 copy_samples=False,
                 copy_data=True,
                 copy_dsattr=True):
        """Initialize dataset instance

        There are basically two different way to create a dataset:

        1. Create a new dataset from samples and sample attributes.  In
           this mode a two-dimensional `ndarray` has to be passed to the
           `samples` keyword argument and the corresponding samples
           attributes are provided via the `labels` and `chunks`
           arguments.

        2. Copy contructor mode
           The second way is used internally to perform quick coyping
           of datasets, e.g. when performing feature selection. In this
           mode and the two dictionaries (`data` and `dsattr`) are
           required. For performance reasons this mode bypasses most of
           the sanity check performed by the previous mode, as for
           internal operations data integrity is assumed.


        :Parameters:
          data : dict
            Dictionary with an arbitrary number of entries. The value for
            each key in the dict has to be an ndarray with the
            same length as the number of rows in the samples array.
            A special entry in this dictionary is 'samples', a 2d array
            (samples x features). A shallow copy is stored in the object.
          dsattr : dict
            Dictionary of dataset attributes. An arbitrary number of
            arbitrarily named and typed objects can be stored here. A
            shallow copy of the dictionary is stored in the object.
          dtype: type | None
            If None -- do not change data type if samples
            is an ndarray. Otherwise convert samples to dtype.


        :Keywords:
          samples : ndarray
            2d array (samples x features)
          labels
            An array or scalar value defining labels for each samples
          labels_map : None or bool or dict
            Map from labels into literal names. If is None or True,
            the mapping is computed, from labels which must be literal.
            If is False, no mapping is computed. If dict -- mapping is
            verified and taken, labels get remapped.  Dict must map
            literal -> number
          chunks
            An array or scalar value defining chunks for each sample

        Each of the Keywords arguments overwrites what is/might be
        already in the `data` container.

        """

        #XXX ClassWithCollections.__init__(self)

        # see if data and dsattr are none, if so, make them empty dicts
        if data is None:
            data = {}
        if dsattr is None:
            dsattr = {}

        # initialize containers; default values are empty dicts
        # always make a shallow copy of what comes in, otherwise total chaos
        # is likely to happen soon
        if copy_data:
            # deep copy (cannot use copy.deepcopy, because samples is an
            # exception
            # but shallow copy first to get a shared version of the data in
            # any case
            lcl_data = data.copy()
            for k, v in data.iteritems():
                # skip copying samples if requested
                if k == 'samples' and not copy_samples:
                    continue
                # per-value .copy() -- assumes every value is an ndarray
                # (or at least exposes .copy())
                lcl_data[k] = v.copy()
        else:
            # shallow copy
            # XXX? yoh: it might be better speed wise just assign dictionary
            #      without any shallow .copy
            lcl_data = data.copy()

        if copy_dsattr and len(dsattr)>0:
            # deep copy
            if __debug__:
                debug('DS', "Deep copying dsattr %s" % `dsattr`)
            lcl_dsattr = copy.deepcopy(dsattr)

        else:
            # shallow copy
            lcl_dsattr = copy.copy(dsattr)

        # has to be not private since otherwise derived methods
        # would have problem accessing it and _registerAttribute
        # would fail on lambda getters
        self._data = lcl_data
        """What makes a dataset."""

        self._dsattr = lcl_dsattr
        """Dataset attriibutes."""

        # store samples (and possibly transform/reshape/retype them)
        # NOTE(review): `samples == None` relies on numpy of that era
        # returning a scalar for array-vs-None comparison; modern numpy
        # compares elementwise -- confirm numpy version pinning
        if not samples == None:
            if __debug__:
                if lcl_data.has_key('samples'):
                    debug('DS',
                          "`Data` dict has `samples` (%s) but there is also" \
                          " __init__ parameter `samples` which overrides " \
                          " stored in `data`" % (`lcl_data['samples'].shape`))
            lcl_data['samples'] = self._shapeSamples(samples, dtype,
                                                     copy_samples)

        # TODO? we might want to have the same logic for chunks and labels
        #       ie if no labels present -- assign arange
        #   MH: don't think this is necessary -- or is there a use case?
        # labels
        if not labels == None:
            if __debug__:
                if lcl_data.has_key('labels'):
                    debug('DS',
                          "`Data` dict has `labels` (%s) but there is also" +
                          " __init__ parameter `labels` which overrides " +
                          " stored in `data`" % (`lcl_data['labels']`))
            # NOTE(review): labels are silently ignored when no samples are
            # present -- the required-attribute check below then raises;
            # confirm this is the intended failure mode
            if lcl_data.has_key('samples'):
                lcl_data['labels'] = \
                    self._expandSampleAttribute(labels, 'labels')

        # check if we got all required attributes
        for attr in self._requiredattributes:
            if not lcl_data.has_key(attr):
                raise DatasetError, \
                      "Attribute %s is required to initialize dataset" % \
                      attr

        nsamples = self.nsamples

        # chunks
        if not chunks == None:
            lcl_data['chunks'] = \
                self._expandSampleAttribute(chunks, 'chunks')
        elif not lcl_data.has_key('chunks'):
            # if no chunk information is given assume that every pattern
            # is its own chunk
            lcl_data['chunks'] = N.arange(nsamples)

        # samples origids
        if not origids is None:
            # simply assign if provided
            lcl_data['origids'] = origids
        elif not lcl_data.has_key('origids'):
            # otherwise contruct unqiue ones
            lcl_data['origids'] = N.arange(len(lcl_data['labels']))
        else:
            # assume origids have been specified already (copy constructor
            # mode) leave them as they are, e.g. to make origids survive
            # selectSamples()
            pass

        # Initialize attributes which are registered but were not setup
        for attr in self._registeredattributes:
            if not lcl_data.has_key(attr):
                if __debug__:
                    debug("DS", "Initializing attribute %s" % attr)
                lcl_data[attr] = N.zeros(nsamples)

        # labels_map
        labels_ = N.asarray(lcl_data['labels'])
        labels_map_known = lcl_dsattr.has_key('labels_map')
        if labels_map is True:
            # need to composte labels_map
            if labels_.dtype.char == 'S' or not labels_map_known:
                # Create mapping: sorted literal labels -> 0..N-1
                ulabels = list(Set(labels_))
                ulabels.sort()
                labels_map = dict([ (x[1], x[0]) for x in enumerate(ulabels) ])
                if __debug__:
                    debug('DS', 'Mapping for the labels computed to be %s'
                          % labels_map)
            else:
                if __debug__:
                    debug('DS', 'Mapping of labels was requested but labels '
                          'are not strings. Skipped')
                labels_map = None
                pass
        elif labels_map is False:
            labels_map = None

        if isinstance(labels_map, dict):
            if labels_map_known:
                if __debug__:
                    debug('DS',
                          "`dsattr` dict has `labels_map` (%s) but there is also" \
                          " __init__ parameter `labels_map` (%s) which overrides " \
                          " stored in `dsattr`" % (lcl_dsattr['labels_map'], labels_map))

            lcl_dsattr['labels_map'] = labels_map
            # map labels if needed (if strings or was explicitely requested)
            if labels_.dtype.char == 'S' or not labels_map_known:
                if __debug__:
                    debug('DS_', "Remapping labels using mapping %s" % labels_map)
                # need to remap
                # !!! N.array is important here
                try:
                    lcl_data['labels'] = N.array(
                        [labels_map[x] for x in lcl_data['labels']])
                except KeyError, e:
                    raise ValueError, "Provided labels_map %s is insufficient " \
                          "to map all the labels. Mapping for label %s is " \
                          "missing" % (labels_map, e)

        elif not lcl_dsattr.has_key('labels_map'):
            lcl_dsattr['labels_map'] = labels_map
        elif __debug__:
            debug('DS_', 'Not overriding labels_map in dsattr since it has one')

        if check_data:
            self._checkData()

        # lazy computation of unique members
        #self._resetallunique('_dsattr', self._dsattr)

        # Michael: we cannot do this conditional here. When selectSamples()
        # removes a whole data chunk the uniquechunks values will be invalid.
        # Same applies to labels of course.
        if not labels is None or not chunks is None:
            # for a speed up to don't go through all uniqueattributes
            # when no need
            lcl_dsattr['__uniquereseted'] = False
            self._resetallunique(force=True)
349 350 351 __doc__ = enhancedDocString('Dataset', locals()) 352 353 354 @property
355 - def idhash(self):
356 """To verify if dataset is in the same state as when smth else was done 357 358 Like if classifier was trained on the same dataset as in question""" 359 360 _data = self._data 361 res = idhash_(_data) 362 363 # we cannot count on the order the values in the dict will show up 364 # with `self._data.value()` and since idhash will be order-dependent 365 # we have to make it deterministic 366 keys = _data.keys() 367 keys.sort() 368 for k in keys: 369 res += idhash_(_data[k]) 370 return res
371 372
373 - def _resetallunique(self, force=False):
374 """Set to None all unique* attributes of corresponding dictionary 375 """ 376 _dsattr = self._dsattr 377 378 if not force and _dsattr['__uniquereseted']: 379 return 380 381 _uniqueattributes = self._uniqueattributes 382 383 if __debug__ and "DS_" in debug.active: 384 debug("DS_", "Reseting all attributes %s for dataset %s" 385 % (_uniqueattributes, 386 self.summary(uniq=False, idhash=False, 387 stats=False, lstats=False))) 388 389 # I guess we better checked if dictname is known but... 390 for k in _uniqueattributes: 391 _dsattr[k] = None 392 _dsattr['__uniquereseted'] = True
393 394
395 - def _getuniqueattr(self, attrib, dict_):
396 """Provide common facility to return unique attributes 397 398 XXX `dict_` can be simply replaced now with self._dsattr 399 """ 400 401 # local bindings 402 _dsattr = self._dsattr 403 404 if not _dsattr.has_key(attrib) or _dsattr[attrib] is None: 405 if __debug__ and 'DS_' in debug.active: 406 debug("DS_", "Recomputing unique set for attrib %s within %s" % 407 (attrib, self.summary(uniq=False, 408 stats=False, lstats=False))) 409 # uff... might come up with better strategy to keep relevant 410 # attribute name 411 _dsattr[attrib] = N.unique( N.asanyarray(dict_[attrib[6:]]) ) 412 assert(not _dsattr[attrib] is None) 413 _dsattr['__uniquereseted'] = False 414 415 return _dsattr[attrib]
416 417
418 - def _setdataattr(self, attrib, value):
419 """Provide common facility to set attributes 420 421 """ 422 if len(value) != self.nsamples: 423 raise ValueError, \ 424 "Provided %s have %d entries while there is %d samples" % \ 425 (attrib, len(value), self.nsamples) 426 self._data[attrib] = N.asarray(value) 427 uniqueattr = "unique" + attrib 428 429 _dsattr = self._dsattr 430 if _dsattr.has_key(uniqueattr): 431 _dsattr[uniqueattr] = None
432 433
434 - def _getNSamplesPerAttr( self, attrib='labels' ):
435 """Returns the number of samples per unique label. 436 """ 437 # local bindings 438 _data = self._data 439 440 # XXX hardcoded dict_=self._data.... might be in self._dsattr 441 uniqueattr = self._getuniqueattr(attrib="unique" + attrib, 442 dict_=_data) 443 444 # use dictionary to cope with arbitrary labels 445 result = dict(zip(uniqueattr, [ 0 ] * len(uniqueattr))) 446 for l in _data[attrib]: 447 result[l] += 1 448 449 # XXX only return values to mimic the old interface but we might want 450 # to return the full dict instead 451 # return result 452 return result
453 454
455 - def _getSampleIdsByAttr(self, values, attrib="labels", 456 sort=True):
457 """Return indecies of samples given a list of attributes 458 """ 459 460 if not operator.isSequenceType(values) \ 461 or isinstance(values, basestring): 462 values = [ values ] 463 464 # TODO: compare to plain for loop through the labels 465 # on a real data example 466 sel = N.array([], dtype=N.int16) 467 _data = self._data 468 for value in values: 469 sel = N.concatenate(( 470 sel, N.where(_data[attrib]==value)[0])) 471 472 if sort: 473 # place samples in the right order 474 sel.sort() 475 476 return sel
477 478
    def idsonboundaries(self, prior=0, post=0,
                        attributes_to_track=['labels', 'chunks'],
                        affected_labels=None,
                        revert=False):
        """Find samples which are on the boundaries of the blocks

        Such samples might need to be removed. By default (with
        prior=0, post=0) ids of the first samples in a 'block' are
        reported

        :Parameters:
          prior : int
            how many samples prior to transition sample to include
          post : int
            how many samples post the transition sample to include
          attributes_to_track : list of basestring
            which attributes to track to decide on the boundary condition
          affected_labels : list of basestring
            for which labels to perform selection. If None - for all
          revert : bool
            either to revert the meaning and provide ids of samples which are found
            to not to be boundary samples

        :Returns: sorted list of sample ids

        NOTE(review): the default for `attributes_to_track` is a shared
        mutable list -- harmless since it is never modified here, but
        fragile if that ever changes.
        """
        # local bindings
        _data = self._data
        labels = self.labels
        nsamples = self.nsamples

        # None's guarantee a "transition" at the very first sample
        lastseen = [None for attr in attributes_to_track]
        transitions = []

        for i in xrange(nsamples):
            current = [_data[attr][i] for attr in attributes_to_track]
            if lastseen != current:
                # transition point: take `prior` samples before and `post`
                # samples after, clipped to the valid [0, nsamples) range
                new_transitions = range(max(0, i-prior),
                                        min(nsamples-1, i+post)+1)
                if affected_labels is not None:
                    # keep only samples carrying one of the affected labels
                    new_transitions = filter(lambda i: labels[i] in affected_labels,
                                             new_transitions)
                transitions += new_transitions
                lastseen = current

        # overlapping prior/post windows may produce duplicates -- uniquify
        transitions = Set(transitions)
        if revert:
            # complement: ids which are NOT on a boundary
            transitions = Set(range(nsamples)).difference(transitions)

        # postprocess
        transitions = N.array(list(transitions))
        transitions.sort()
        return list(transitions)
530 531
532 - def _shapeSamples(self, samples, dtype, copy):
533 """Adapt different kinds of samples 534 535 Handle all possible input value for 'samples' and tranform 536 them into a 2d (samples x feature) representation. 537 """ 538 # put samples array into correct shape 539 # 1d arrays or simple sequences are assumed to be a single pattern 540 if (not isinstance(samples, N.ndarray)): 541 # it is safe to provide dtype which defaults to None, 542 # when N would choose appropriate dtype automagically 543 samples = N.array(samples, ndmin=2, dtype=dtype, copy=copy) 544 else: 545 if samples.ndim < 2 \ 546 or (not dtype is None and dtype != samples.dtype): 547 if dtype is None: 548 dtype = samples.dtype 549 samples = N.array(samples, ndmin=2, dtype=dtype, copy=copy) 550 elif copy: 551 samples = samples.copy() 552 553 # only samples x features matrices are supported 554 if len(samples.shape) > 2: 555 raise DatasetError, "Only (samples x features) -> 2d sample " \ 556 + "are supported (got %s shape of samples)." \ 557 % (`samples.shape`) \ 558 +" Consider MappedDataset if applicable." 559 560 return samples
561 562
563 - def _checkData(self):
564 """Checks `_data` members to have the same # of samples. 565 """ 566 # 567 # XXX: Maybe just run this under __debug__ and remove the `check_data` 568 # from the constructor, which is too complicated anyway? 569 # 570 571 # local bindings 572 nsamples = self.nsamples 573 _data = self._data 574 575 for k, v in _data.iteritems(): 576 if not len(v) == nsamples: 577 raise DatasetError, \ 578 "Length of sample attribute '%s' [%i] does not " \ 579 "match the number of samples in the dataset [%i]." \ 580 % (k, len(v), nsamples) 581 582 # check for unique origids 583 uniques = N.unique(_data['origids']) 584 uniques.sort() 585 # need to copy to prevent sorting the original array 586 sorted_ids = _data['origids'].copy() 587 sorted_ids.sort() 588 589 if not (uniques == sorted_ids).all(): 590 raise DatasetError, "Samples IDs are not unique."
591 592
593 - def _expandSampleAttribute(self, attr, attr_name):
594 """If a sample attribute is given as a scalar expand/repeat it to a 595 length matching the number of samples in the dataset. 596 """ 597 try: 598 # if we are initializing with a single string -- we should 599 # treat it as a single label 600 if isinstance(attr, basestring): 601 raise TypeError 602 if len(attr) != self.nsamples: 603 raise DatasetError, \ 604 "Length of sample attribute '%s' [%d]" \ 605 % (attr_name, len(attr)) \ 606 + " has to match the number of samples" \ 607 + " [%d]." % self.nsamples 608 # store the sequence as array 609 return N.array(attr) 610 611 except TypeError: 612 # make sequence of identical value matching the number of 613 # samples 614 return N.repeat(attr, self.nsamples)
    @classmethod
    def _registerAttribute(cls, key, dictname="_data", abbr=None, hasunique=False):
        """Register an attribute for any Dataset class.

        Creates property assigning getters/setters depending on the
        availability of corresponding _get, _set functions.

        :Parameters:
          key : basestring
            name of the attribute (e.g. 'labels')
          dictname : basestring
            name of the instance dict the attribute lives in ('_data')
          abbr : basestring or None
            optional short alias for the property (and 'U'+abbr for uniques)
          hasunique : bool
            whether to also create unique<key>, samplesper<key[:-1]> and
            idsby<key> properties

        NOTE: properties are attached via ``exec`` so the getter/setter
        expressions assembled as strings can be evaluated here; assumes
        trusted (internal) callers only.
        """
        classdict = cls.__dict__
        if not classdict.has_key(key):
            if __debug__:
                debug("DS", "Registering new attribute %s" % key)
            # define get function and use corresponding
            # _getATTR if such defined
            getter = '_get%s' % key
            if classdict.has_key(getter):
                getter = '%s.%s' % (cls.__name__, getter)
            else:
                getter = "lambda x: x.%s['%s']" % (dictname, key)

            # define set function and use corresponding
            # _setATTR if such defined
            setter = '_set%s' % key
            if classdict.has_key(setter):
                setter = '%s.%s' % (cls.__name__, setter)
            elif dictname=="_data":
                setter = "lambda self,x: self._setdataattr" + \
                         "(attrib='%s', value=x)" % (key)
            else:
                setter = None

            if __debug__:
                debug("DS", "Registering new property %s.%s" %
                      (cls.__name__, key))
            exec "%s.%s = property(fget=%s, fset=%s)" % \
                 (cls.__name__, key, getter, setter)

            if abbr is not None:
                exec "%s.%s = property(fget=%s, fset=%s)" % \
                     (cls.__name__, abbr, getter, setter)

            if hasunique:
                uniquekey = "unique%s" % key
                getter = '_get%s' % uniquekey
                if classdict.has_key(getter):
                    getter = '%s.%s' % (cls.__name__, getter)
                else:
                    getter = "lambda x: x._getuniqueattr" + \
                             "(attrib='%s', dict_=x.%s)" % (uniquekey, dictname)

                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, uniquekey))

                exec "%s.%s = property(fget=%s)" % \
                     (cls.__name__, uniquekey, getter)
                if abbr is not None:
                    exec "%s.U%s = property(fget=%s)" % \
                         (cls.__name__, abbr, getter)

                # create samplesper<ATTR> properties
                sampleskey = "samplesper%s" % key[:-1] # remove ending 's' XXX
                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, sampleskey))

                exec "%s.%s = property(fget=%s)" % \
                     (cls.__name__, sampleskey,
                      "lambda x: x._getNSamplesPerAttr(attrib='%s')" % key)

                cls._uniqueattributes.append(uniquekey)

                # create idsby<ATTR> properties
                sampleskey = "idsby%s" % key # remove ending 's' XXX
                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, sampleskey))

                exec "%s.%s = %s" % (cls.__name__, sampleskey,
                                     "lambda self, x: " +
                                     "self._getSampleIdsByAttr(x,attrib='%s')" % key)

                # NOTE(review): uniquekey was already appended above --
                # this second append looks unintended; confirm
                cls._uniqueattributes.append(uniquekey)

            cls._registeredattributes.append(key)
        elif __debug__:
            warning('Trying to reregister attribute `%s`. For now ' % key +
                    'such capability is not present')
704 705
706 - def __str__(self):
707 """String summary over the object 708 """ 709 return self.summary(uniq=True, 710 idhash=__debug__ and ('DS_ID' in debug.active), 711 stats=__debug__ and ('DS_STATS' in debug.active), 712 lstats=__debug__ and ('DS_STATS' in debug.active), 713 )
714 715
716 - def __repr__(self):
717 return "<%s>" % str(self)
718 719
720 - def summary(self, uniq=True, stats=True, idhash=False, lstats=True, 721 maxc=30, maxl=20):
722 """String summary over the object 723 724 :Parameters: 725 uniq : bool 726 Include summary over data attributes which have unique 727 idhash : bool 728 Include idhash value for dataset and samples 729 stats : bool 730 Include some basic statistics (mean, std, var) over dataset samples 731 lstats : bool 732 Include statistics on chunks/labels 733 maxc : int 734 Maximal number of chunks when provide details on labels/chunks 735 maxl : int 736 Maximal number of labels when provide details on labels/chunks 737 """ 738 # local bindings 739 samples = self.samples 740 _data = self._data 741 _dsattr = self._dsattr 742 743 if idhash: 744 idhash_ds = "{%s}" % self.idhash 745 idhash_samples = "{%s}" % idhash_(samples) 746 else: 747 idhash_ds = "" 748 idhash_samples = "" 749 750 s = """Dataset %s/ %s %d%s x %d""" % \ 751 (idhash_ds, samples.dtype, 752 self.nsamples, idhash_samples, self.nfeatures) 753 754 ssep = (' ', '\n')[lstats] 755 if uniq: 756 s += "%suniq:" % ssep 757 for uattr in _dsattr.keys(): 758 if not uattr.startswith("unique"): 759 continue 760 attr = uattr[6:] 761 try: 762 value = self._getuniqueattr(attrib=uattr, 763 dict_=_data) 764 s += " %d %s" % (len(value), attr) 765 except: 766 pass 767 768 if isinstance(self.labels_map, dict): 769 s += ' labels_mapped' 770 771 if stats: 772 # TODO -- avg per chunk? 773 # XXX We might like to use scipy.stats.describe to get 774 # quick summary statistics (mean/range/skewness/kurtosis) 775 s += "%sstats: mean=%g std=%g var=%g min=%g max=%g\n" % \ 776 (ssep, N.mean(samples), N.std(samples), 777 N.var(samples), N.min(samples), N.max(samples)) 778 779 if lstats: 780 s += self.summary_labels(maxc=maxc, maxl=maxl) 781 782 return s
783 784
    def summary_labels(self, maxc=30, maxl=20):
        """Provide summary statistics over the labels and chunks

        :Parameters:
          maxc : int
            Maximal number of chunks when provide details
          maxl : int
            Maximal number of labels when provide details

        :Returns: the summary as a (possibly multi-line) string
        """
        # We better avoid bound function since if people only
        # imported Dataset without miscfx it would fail
        from mvpa.datasets.miscfx import getSamplesPerChunkLabel
        # spcl rows correspond to chunks (see zip below), columns to labels
        spcl = getSamplesPerChunkLabel(self)
        # XXX couldn't they be unordered?
        ul = self.uniquelabels.tolist()
        uc = self.uniquechunks.tolist()
        s = ""
        if len(ul) < maxl and len(uc) < maxc:
            s += "\nCounts of labels in each chunk:"
            # only in a resonable case do printing
            table = [[' chunks\labels'] + ul]
            table += [[''] + ['---'] * len(ul)]
            for c, counts in zip(uc, spcl):
                table.append([ str(c) ] + counts.tolist())
            s += '\n' + table2string(table)
        else:
            s += "No details due to large number of labels or chunks. " \
                 "Increase maxc and maxl if desired"

        labels_map = self.labels_map
        if isinstance(labels_map, dict):
            s += "\nOriginal labels were mapped using following mapping:"
            s += '\n\t'+'\n\t'.join([':\t'.join(map(str, x))
                                     for x in labels_map.items()]) + '\n'

        def cl_stats(axis, u, name1, name2):
            """ Compute statistics per label
            """
            # per-row (axis=0, chunks collapsed) or per-column (axis=1)
            # descriptive stats over the counts matrix
            stats = {'min': N.min(spcl, axis=axis),
                     'max': N.max(spcl, axis=axis),
                     'mean': N.mean(spcl, axis=axis),
                     'std': N.std(spcl, axis=axis),
                     '#%ss' % name2: N.sum(spcl>0, axis=axis)}
            entries = [' ' + name1, 'mean', 'std', 'min', 'max', '#%ss' % name2]
            table = [ entries ]
            for i, l in enumerate(u):
                d = {' ' + name1 : l}
                d.update(dict([ (k, stats[k][i]) for k in stats.keys()]))
                # format numbers via %.3g, leave strings as-is
                table.append( [ ('%.3g', '%s')[isinstance(d[e], basestring)]
                                % d[e] for e in entries] )
            return '\nSummary per %s across %ss\n' % (name1, name2) \
                   + table2string(table)

        if len(ul) < maxl:
            s += cl_stats(0, ul, 'label', 'chunk')
        if len(uc) < maxc:
            s += cl_stats(1, uc, 'chunk', 'label')
        return s
843 844
    def __iadd__(self, other):
        """Merge the samples of one Dataset object to another (in-place).

        No dataset attributes, besides labels_map, will be merged!
        Additionally, a new set of unique `origids` will be generated.

        :Raises:
          DatasetError -- when the number of features differs
          ValueError -- when only one of the two datasets has a labels_map
        """
        # local bindings
        _data = self._data
        other_data = other._data

        if not self.nfeatures == other.nfeatures:
            raise DatasetError, "Cannot add Dataset, because the number of " \
                  "feature do not match."

        # take care about labels_map and labels
        slm = self.labels_map
        olm = other.labels_map
        # either both or neither dataset may carry a labels_map
        if N.logical_xor(slm is None, olm is None):
            raise ValueError, "Cannot add datasets where only one of them " \
                  "has labels map assigned. If needed -- implement it"

        # concatenate all sample attributes
        for k,v in _data.iteritems():
            if k == 'origids':
                # special case samples origids: for now just regenerate unique
                # ones could also check if concatenation is unique, but it
                # would be costly performance-wise
                _data[k] = N.arange(len(v) + len(other_data[k]))

            elif k == 'labels' and slm is not None:
                # special care about labels if mapping was in effect,
                # we need to append 2nd map to the first one and
                # relabel 2nd dataset
                nlm = slm.copy()
                # figure out maximal numerical label used now
                nextid = N.sort(nlm.values())[-1] + 1
                olabels = other.labels
                # numeric-id -> numeric-id translation for `other`'s labels
                olabels_remap = {}
                for ol, olnum in olm.iteritems():
                    if not nlm.has_key(ol):
                        # check if we can preserve old numberic label
                        # if not -- assign some new one not yet present
                        # in any dataset
                        if olnum in nlm.values():
                            nextid = N.sort(nlm.values() + olm.values())[-1] + 1
                        else:
                            nextid = olnum
                        olabels_remap[olnum] = nextid
                        nlm[ol] = nextid
                        nextid += 1
                    else:
                        olabels_remap[olnum] = nlm[ol]
                olabels = [olabels_remap[x] for x in olabels]
                # finally compose new labels
                _data['labels'] = N.concatenate((v, olabels), axis=0)
                # and reassign new mapping
                self._dsattr['labels_map'] = nlm

                if __debug__:
                    # check if we are not dealing with colliding
                    # mapping, since it is problematic and might lead
                    # to various complications
                    if (len(Set(slm.keys())) != len(Set(slm.values()))) or \
                       (len(Set(olm.keys())) != len(Set(olm.values()))):
                        warning("Adding datasets where multiple labels "
                                "mapped to the same ID is not recommended. "
                                "Please check the outcome. Original mappings "
                                "were %s and %s. Resultant is %s"
                                % (slm, olm, nlm))

            else:
                # plain sample attribute -- simple concatenation
                _data[k] = N.concatenate((v, other_data[k]), axis=0)

        # might be more sophisticated but for now just reset -- it is safer ;)
        self._resetallunique()

        return self
922 923
924 - def __add__( self, other ):
925 """Merge the samples two Dataset objects. 926 927 All data of both datasets is copied, concatenated and a new Dataset is 928 returned. 929 930 NOTE: This can be a costly operation (both memory and time). If 931 performance is important consider the '+=' operator. 932 """ 933 # create a new object of the same type it is now and NOT only Dataset 934 out = super(Dataset, self).__new__(self.__class__) 935 936 # now init it: to make it work all Dataset contructors have to accept 937 # Class(data=Dict, dsattr=Dict) 938 out.__init__(data=self._data, 939 dsattr=self._dsattr, 940 copy_samples=True, 941 copy_data=True, 942 copy_dsattr=True) 943 944 out += other 945 946 return out
947 948
949 - def copy(self):
950 """Create a copy (clone) of the dataset, by fully copying current one 951 952 """ 953 # create a new object of the same type it is now and NOT only Dataset 954 out = super(Dataset, self).__new__(self.__class__) 955 956 # now init it: to make it work all Dataset contructors have to accept 957 # Class(data=Dict, dsattr=Dict) 958 out.__init__(data=self._data, 959 dsattr=self._dsattr, 960 copy_samples=True, 961 copy_data=True, 962 copy_dsattr=True) 963 964 return out
965 966
967 - def selectFeatures(self, ids=None, sort=True, groups=None):
968 """Select a number of features from the current set. 969 970 :Parameters: 971 ids 972 iterable container to select ids 973 sort : bool 974 if to sort Ids. Order matters and `selectFeatures` assumes 975 incremental order. If not such, in non-optimized code 976 selectFeatures would verify the order and sort 977 978 Returns a new Dataset object with a view of the original 979 samples array (no copying is performed). 980 981 WARNING: The order of ids determines the order of features in 982 the returned dataset. This might be useful sometimes, but can 983 also cause major headaches! Order would is verified when 984 running in non-optimized code (if __debug__) 985 """ 986 if ids is None and groups is None: 987 raise ValueError, "No feature selection specified." 988 989 # start with empty list if no ids where specified (so just groups) 990 if ids is None: 991 ids = [] 992 993 if not groups is None: 994 if not self._dsattr.has_key('featuregroups'): 995 raise RuntimeError, \ 996 "Dataset has no feature grouping information." 
997 998 for g in groups: 999 ids += (self._dsattr['featuregroups'] == g).nonzero()[0].tolist() 1000 1001 # XXX set sort default to True, now sorting has to be explicitely 1002 # disabled and warning is not necessary anymore 1003 if sort: 1004 ids.sort() 1005 elif __debug__ and 'CHECK_DS_SORTED' in debug.active: 1006 from mvpa.misc.support import isSorted 1007 if not isSorted(ids): 1008 warning("IDs for selectFeatures must be provided " + 1009 "in sorted order, otherwise major headache might occur") 1010 1011 # shallow-copy all stuff from current data dict 1012 new_data = self._data.copy() 1013 1014 # assign the selected features -- data is still shared with 1015 # current dataset 1016 new_data['samples'] = self._data['samples'][:, ids] 1017 1018 # apply selection to feature groups as well 1019 if self._dsattr.has_key('featuregroups'): 1020 new_dsattr = self._dsattr.copy() 1021 new_dsattr['featuregroups'] = self._dsattr['featuregroups'][ids] 1022 else: 1023 new_dsattr = self._dsattr 1024 1025 # create a new object of the same type it is now and NOT onyl Dataset 1026 dataset = super(Dataset, self).__new__(self.__class__) 1027 1028 # now init it: to make it work all Dataset contructors have to accept 1029 # Class(data=Dict, dsattr=Dict) 1030 dataset.__init__(data=new_data, 1031 dsattr=new_dsattr, 1032 check_data=False, 1033 copy_samples=False, 1034 copy_data=False, 1035 copy_dsattr=False 1036 ) 1037 1038 return dataset
1039 1040
    def applyMapper(self, featuresmapper=None, samplesmapper=None,
                    train=True):
        """Obtain new dataset by applying mappers over features and/or samples.

        While featuresmappers leave the sample attributes information
        unchanged, as the number of samples in the dataset is invariant,
        samplesmappers are also applied to the samples attributes themselves!

        Applying a featuresmapper will destroy any feature grouping information.

        :Parameters:
          featuresmapper : Mapper
            `Mapper` to somehow transform each sample's features
          samplesmapper : Mapper
            `Mapper` to transform each feature across samples
          train : bool
            Flag whether to train the mapper with this dataset before applying
            it.

        NOTE(review): despite the `train` parameter, both branches below call
        .train() unconditionally -- `train` is never consulted; confirm
        whether that is intended.

        TODO: selectFeatures is pretty much
              applyMapper(featuresmapper=MaskMapper(...))
        """

        # shallow-copy all stuff from current data dict
        new_data = self._data.copy()

        # apply mappers

        if samplesmapper:
            if __debug__:
                debug("DS", "Training samplesmapper %s" % `samplesmapper`)
            samplesmapper.train(self)

            if __debug__:
                debug("DS", "Applying samplesmapper %s" % `samplesmapper` +
                      " to samples of dataset `%s`" % `self`)

            # get rid of existing 'origids' as they are not valid anymore and
            # applying a mapper to them is not really meaningful
            if new_data.has_key('origids'):
                del(new_data['origids'])

            # apply mapper to all sample-wise data in dataset
            # (keys are taken from new_data -- already without 'origids' --
            # but the values fed to the mapper come from the original data)
            for k in new_data.keys():
                new_data[k] = samplesmapper.forward(self._data[k])

        # feature mapping might affect dataset attributes
        # XXX: might be obsolete when proper feature attributes are implemented
        new_dsattr = self._dsattr

        if featuresmapper:
            if __debug__:
                debug("DS", "Training featuresmapper %s" % `featuresmapper`)
            featuresmapper.train(self)

            if __debug__:
                debug("DS", "Applying featuresmapper %s" % `featuresmapper` +
                      " to samples of dataset `%s`" % `self`)
            new_data['samples'] = featuresmapper.forward(self._data['samples'])

            # remove feature grouping, who knows what the mapper did to the
            # features
            if self._dsattr.has_key('featuregroups'):
                new_dsattr = self._dsattr.copy()
                del(new_dsattr['featuregroups'])
            else:
                new_dsattr = self._dsattr

        # create a new object of the same type it is now and NOT only Dataset
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset contructors have to accept
        # Class(data=Dict, dsattr=Dict)
        dataset.__init__(data=new_data,
                         dsattr=new_dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False
                         )

        # samples attributes might have changed after applying samplesmapper
        if samplesmapper:
            dataset._resetallunique(force=True)

        return dataset
1127 1128
1129 - def selectSamples(self, ids):
1130 """Choose a subset of samples defined by samples IDs. 1131 1132 Returns a new dataset object containing the selected sample 1133 subset. 1134 1135 TODO: yoh, we might need to sort the mask if the mask is a 1136 list of ids and is not ordered. Clarify with Michael what is 1137 our intent here! 1138 """ 1139 # without having a sequence a index the masked sample array would 1140 # loose its 2d layout 1141 if not operator.isSequenceType( ids ): 1142 ids = [ids] 1143 # TODO: Reconsider crafting a slice if it can be done to don't copy 1144 # the data 1145 #try: 1146 # minmask = min(mask) 1147 # maxmask = max(mask) 1148 #except: 1149 # minmask = min(map(int,mask)) 1150 # maxmask = max(map(int,mask)) 1151 # lets see if we could get it done with cheap view/slice 1152 #(minmask, maxmask) != (0, 1) and \ 1153 #if len(mask) > 2 and \ 1154 # N.array([N.arange(minmask, maxmask+1) == N.array(mask)]).all(): 1155 # slice_ = slice(minmask, maxmask+1) 1156 # if __debug__: 1157 # debug("DS", "We can and do convert mask %s into splice %s" % 1158 # (mask, slice_)) 1159 # mask = slice_ 1160 # mask all sample attributes 1161 data = {} 1162 for k, v in self._data.iteritems(): 1163 data[k] = v[ids, ] 1164 1165 # create a new object of the same type it is now and NOT onyl Dataset 1166 dataset = super(Dataset, self).__new__(self.__class__) 1167 1168 # now init it: to make it work all Dataset contructors have to accept 1169 # Class(data=Dict, dsattr=Dict) 1170 dataset.__init__(data=data, 1171 dsattr=self._dsattr, 1172 check_data=False, 1173 copy_samples=False, 1174 copy_data=False, 1175 copy_dsattr=False) 1176 1177 dataset._resetallunique(force=True) 1178 return dataset
1179 1180 1181
    def index(self, *args, **kwargs):
        """Universal indexer to obtain indexes of interesting samples/features.
        See .select() for more information

        :Return: tuple of (samples indexes, features indexes). Each
          item could be also None, if no selection on samples or
          features was requested (to discriminate between no selected
          items, and no selections)
        """
        s_indx = []                     # selections for samples
        f_indx = []                     # selections for features
        # popped here so it never reaches the attribute matching below;
        # NOTE(review): the value itself is unused in this method
        return_dataset = kwargs.pop('return_dataset', False)
        largs = len(args)

        args = list(args)               # so we could override
        # Figure out number of positional arguments that select
        # samples/features directly (as opposed to ('attr', values) pairs)
        largs_nonstring = 0
        # need to go with index since we might need to override internally
        for i in xrange(largs):
            l = args[i]
            if isinstance(l, basestring):
                if l.lower() == 'all':
                    # 'all' is a shorthand for a full slice
                    args[i] = slice(None)
                else:
                    # first plain string ends the positional part
                    break
            largs_nonstring += 1

        # positional arg 1 selects samples, positional arg 2 features
        if largs_nonstring >= 1:
            s_indx.append(args[0])
            if __debug__ and 'CHECK_DS_SELECT' in debug.active:
                _validate_indexes_uniq_sorted(args[0], 'select', 'samples')
            if largs_nonstring == 2:
                f_indx.append(args[1])
                if __debug__ and 'CHECK_DS_SELECT' in debug.active:
                    _validate_indexes_uniq_sorted(args[1], 'select', 'features')
            elif largs_nonstring > 2:
                raise ValueError, "Only two positional arguments are allowed" \
                      ". 1st for samples, 2nd for features"

        # process left positional arguments which must encode selections like
        # ('labels', [1,2,3])
        if (largs - largs_nonstring) % 2 != 0:
            raise ValueError, "Positional selections must come in pairs:" \
                  " e.g. ('labels', [1,2,3])"

        # fold the ('attr', values) pairs into kwargs for uniform handling
        for i in xrange(largs_nonstring, largs, 2):
            k, v = args[i:i+2]
            kwargs[k] = v

        # process keyword parameters
        data_ = self._data
        for k, v in kwargs.iteritems():
            if k == 'samples':
                s_indx.append(v)
            elif k == 'features':
                f_indx.append(v)
            elif data_.has_key(k):
                # so it is an attribute for samples
                # XXX may be do it not only if __debug__
                if __debug__: # and 'CHECK_DS_SELECT' in debug.active:
                    if not N.any([isinstance(v, cls) for cls in
                                  [list, tuple, slice, int]]):
                        raise ValueError, "Trying to specify selection for %s " \
                              "based on unsupported '%s'" % (k, v)
                s_indx.append(self._getSampleIdsByAttr(v, attrib=k, sort=False))
            else:
                raise ValueError, 'Keyword "%s" is not known, thus' \
                      'select() failed' % k

        def combine_indexes(indx, nelements):
            """Helper function: intersect selections given in indx

            :Parameters:
              indxs : list of lists or slices
                selections of elements
              nelements : int
                number of elements total for deriving indexes from slices
            """
            indx_sel = None             # pure list of ids for selection
            for s in indx:
                if isinstance(s, slice) or \
                   isinstance(s, N.ndarray) and s.dtype==bool:
                    # XXX there might be a better way than reconstructing the full
                    # index list. Also we are loosing ability to do simlpe slicing,
                    # ie w.o making a copy of the selected data
                    all_indexes = N.arange(nelements)
                    s = all_indexes[s]
                elif not operator.isSequenceType(s):
                    s = [ s ]

                if indx_sel is None:
                    indx_sel = Set(s)
                else:
                    # To be consistent
                    #if not isinstance(indx_sel, Set):
                    #    indx_sel = Set(indx_sel)
                    indx_sel = indx_sel.intersection(s)

            # if we got Set -- convert
            if isinstance(indx_sel, Set):
                indx_sel = list(indx_sel)

            # sort for the sake of sanity
            indx_sel.sort()

            return indx_sel

        # Select samples
        if len(s_indx) == 1 and isinstance(s_indx[0], slice) \
               and s_indx[0] == slice(None):
            # so no actual selection -- full slice
            s_indx = s_indx[0]
        else:
            # else - get indexes
            if len(s_indx) == 0:
                s_indx = None
            else:
                s_indx = combine_indexes(s_indx, self.nsamples)

        # Select features
        if len(f_indx):
            f_indx = combine_indexes(f_indx, self.nfeatures)
        else:
            f_indx = None

        return s_indx, f_indx
1312 - def select(self, *args, **kwargs):
1313 """Universal selector 1314 1315 WARNING: if you need to select duplicate samples 1316 (e.g. samples=[5,5]) or order of selected samples of features 1317 is important and has to be not ordered (e.g. samples=[3,2,1]), 1318 please use selectFeatures or selectSamples functions directly 1319 1320 Examples: 1321 Mimique plain selectSamples:: 1322 1323 dataset.select([1,2,3]) 1324 dataset[[1,2,3]] 1325 1326 Mimique plain selectFeatures:: 1327 1328 dataset.select(slice(None), [1,2,3]) 1329 dataset.select('all', [1,2,3]) 1330 dataset[:, [1,2,3]] 1331 1332 Mixed (select features and samples):: 1333 1334 dataset.select([1,2,3], [1, 2]) 1335 dataset[[1,2,3], [1, 2]] 1336 1337 Select samples matching some attributes:: 1338 1339 dataset.select(labels=[1,2], chunks=[2,4]) 1340 dataset.select('labels', [1,2], 'chunks', [2,4]) 1341 dataset['labels', [1,2], 'chunks', [2,4]] 1342 1343 Mixed -- out of first 100 samples, select only those with 1344 labels 1 or 2 and belonging to chunks 2 or 4, and select 1345 features 2 and 3:: 1346 1347 dataset.select(slice(0,100), [2,3], labels=[1,2], chunks=[2,4]) 1348 dataset[:100, [2,3], 'labels', [1,2], 'chunks', [2,4]] 1349 1350 """ 1351 s_indx, f_indx = self.index(*args, **kwargs) 1352 1353 # Select samples 1354 if s_indx == slice(None): 1355 # so no actual selection was requested among samples. 1356 # thus proceed with original dataset 1357 if __debug__: 1358 debug('DS', 'in select() not selecting samples') 1359 ds = self 1360 else: 1361 # else do selection 1362 if __debug__: 1363 debug('DS', 'in select() selecting samples given selections' 1364 + str(s_indx)) 1365 ds = self.selectSamples(s_indx) 1366 1367 # Select features 1368 if f_indx is not None: 1369 if __debug__: 1370 debug('DS', 'in select() selecting features given selections' 1371 + str(f_indx)) 1372 ds = ds.selectFeatures(f_indx) 1373 1374 return ds
1375 1376 1377
1378 - def where(self, *args, **kwargs):
1379 """Obtain indexes of interesting samples/features. See select() for more information 1380 1381 XXX somewhat obsoletes idsby... 1382 """ 1383 s_indx, f_indx = self.index(*args, **kwargs) 1384 if s_indx is not None and f_indx is not None: 1385 return s_indx, f_indx 1386 elif s_indx is not None: 1387 return s_indx 1388 else: 1389 return f_indx
1390 1391
1392 - def __getitem__(self, *args):
1393 """Convinience dataset parts selection 1394 1395 See select for more information 1396 """ 1397 # for cases like ['labels', 1] 1398 if len(args) == 1 and isinstance(args[0], tuple): 1399 args = args[0] 1400 1401 args_, args = args, () 1402 for a in args_: 1403 if isinstance(a, slice) and \ 1404 isinstance(a.start, basestring): 1405 # for the constructs like ['labels':[1,2]] 1406 if a.stop is None or a.step is not None: 1407 raise ValueError, \ 1408 "Selection must look like ['chunks':[2,3]]" 1409 args += (a.start, a.stop) 1410 else: 1411 args += (a,) 1412 return self.select(*args)
1413 1414
    def permuteLabels(self, status, perchunk=True, assure_permute=False):
        """Permute the labels.

        TODO: rename status into something closer in semantics.

        :Parameters:
          status : bool
            Calling this method with set to True, the labels are
            permuted among all samples. If 'status' is False the
            original labels are restored.
          perchunk : bool
            If True permutation is limited to samples sharing the same
            chunk value. Therefore only the association of a certain
            sample with a label is permuted while keeping the absolute
            number of occurences of each label value within a certain
            chunk constant.
          assure_permute : bool
            If True, assures that labels are permutted, ie any one is
            different from the original one
        """
        # local bindings
        _data = self._data

        if len(self.uniquelabels)<2:
            raise RuntimeError, \
                  "Call to permuteLabels is bogus since there is insuficient" \
                  " number of labels: %s" % self.uniquelabels

        if not status:
            # restore originals
            if _data.get('origlabels', None) is None:
                raise RuntimeError, 'Cannot restore labels. ' \
                      'permuteLabels() has never been ' \
                      'called with status == True.'
            self.labels = _data['origlabels']
            _data.pop('origlabels')
        else:
            # store orig labels, but only if not yet done, otherwise multiple
            # calls with status == True will destroy the original labels
            # NOTE(review): '== None' on an ndarray is elementwise in modern
            # numpy; presumably 'origlabels' is never an array here or this
            # relies on old numpy semantics -- verify
            if not _data.has_key('origlabels') \
                   or _data['origlabels'] == None:
                # bind old labels to origlabels
                _data['origlabels'] = _data['labels']
                # copy labels so the permutation below cannot touch origlabels
                _data['labels'] = copy.copy(_data['labels'])

            labels = _data['labels']
            # now scramble
            if perchunk:
                # permute in-place within each chunk separately, so the
                # label histogram per chunk is preserved
                for o in self.uniquechunks:
                    labels[self.chunks == o] = \
                        N.random.permutation(labels[self.chunks == o])
            else:
                labels = N.random.permutation(labels)

            self.labels = labels

            if assure_permute:
                # retry until at least one label differs from the original
                if not (_data['labels'] != _data['origlabels']).any():
                    if not (assure_permute is True):
                        if assure_permute == 1:
                            # countdown (started at 11 below) exhausted
                            raise RuntimeError, \
                                  "Cannot assure permutation of labels %s for " \
                                  "some reason with chunks %s and while " \
                                  "perchunk=%s . Should not happen" % \
                                  (self.labels, self.chunks, perchunk)
                    else:
                        assure_permute = 11 # make 10 attempts
                    if __debug__:
                        debug("DS", "Recalling permute to assure different labels")
                    self.permuteLabels(status, perchunk=perchunk,
                                       assure_permute=assure_permute-1)
1487 1488
1489 - def getRandomSamples(self, nperlabel):
1490 """Select a random set of samples. 1491 1492 If 'nperlabel' is an integer value, the specified number of samples is 1493 randomly choosen from the group of samples sharing a unique label 1494 value ( total number of selected samples: nperlabel x len(uniquelabels). 1495 1496 If 'nperlabel' is a list which's length has to match the number of 1497 unique label values. In this case 'nperlabel' specifies the number of 1498 samples that shall be selected from the samples with the corresponding 1499 label. 1500 1501 The method returns a Dataset object containing the selected 1502 samples. 1503 """ 1504 # if interger is given take this value for all classes 1505 if isinstance(nperlabel, int): 1506 nperlabel = [ nperlabel for i in self.uniquelabels ] 1507 1508 sample = [] 1509 # for each available class 1510 labels = self.labels 1511 for i, r in enumerate(self.uniquelabels): 1512 # get the list of pattern ids for this class 1513 sample += random.sample( (labels == r).nonzero()[0], 1514 nperlabel[i] ) 1515 1516 return self.selectSamples( sample )
1517 1518 1519 # def _setchunks(self, chunks): 1520 # """Sets chunks and recomputes uniquechunks 1521 # """ 1522 # self._data['chunks'] = N.array(chunks) 1523 # self._dsattr['uniquechunks'] = None # None!since we might not need them 1524 1525
1526 - def getNSamples( self ):
1527 """Currently available number of patterns. 1528 """ 1529 return self._data['samples'].shape[0]
1530 1531
1532 - def getNFeatures( self ):
1533 """Number of features per pattern. 1534 """ 1535 return self._data['samples'].shape[1]
1536 1537
1538 - def getLabelsMap(self):
1539 """Stored labels map (if any) 1540 """ 1541 return self._dsattr.get('labels_map', None)
1542 1543
1544 - def setLabelsMap(self, lm):
1545 """Set labels map. 1546 1547 Checks for the validity of the mapping -- values should cover 1548 all existing labels in the dataset 1549 """ 1550 values = Set(lm.values()) 1551 labels = Set(self.uniquelabels) 1552 if not values.issuperset(labels): 1553 raise ValueError, \ 1554 "Provided mapping %s has some existing labels (out of %s) " \ 1555 "missing from mapping" % (list(values), list(labels)) 1556 self._dsattr['labels_map'] = lm
1557 1558
1559 - def setSamplesDType(self, dtype):
1560 """Set the data type of the samples array. 1561 """ 1562 # local bindings 1563 _data = self._data 1564 1565 if _data['samples'].dtype != dtype: 1566 _data['samples'] = _data['samples'].astype(dtype)
1567 1568
1569 - def defineFeatureGroups(self, definition):
1570 """Assign `definition` to featuregroups 1571 1572 XXX Feature-groups was not finished to be useful 1573 """ 1574 if not len(definition) == self.nfeatures: 1575 raise ValueError, \ 1576 "Length of feature group definition %i " \ 1577 "does not match the number of features %i " \ 1578 % (len(definition), self.nfeatures) 1579 1580 self._dsattr['featuregroups'] = N.array(definition)
1581 1582
1583 - def convertFeatureIds2FeatureMask(self, ids):
1584 """Returns a boolean mask with all features in `ids` selected. 1585 1586 :Parameters: 1587 ids: list or 1d array 1588 To be selected features ids. 1589 1590 :Returns: 1591 ndarray: dtype='bool' 1592 All selected features are set to True; False otherwise. 1593 """ 1594 fmask = N.repeat(False, self.nfeatures) 1595 fmask[ids] = True 1596 1597 return fmask
1598 1599
1600 - def convertFeatureMask2FeatureIds(self, mask):
1601 """Returns feature ids corresponding to non-zero elements in the mask. 1602 1603 :Parameters: 1604 mask: 1d ndarray 1605 Feature mask. 1606 1607 :Returns: 1608 ndarray: integer 1609 Ids of non-zero (non-False) mask elements. 1610 """ 1611 return mask.nonzero()[0]
1612 1613 1614 @staticmethod
1615 - def _checkCopyConstructorArgs(**kwargs):
1616 """Common sanity check for Dataset copy constructor calls.""" 1617 # check if we have samples (somwhere) 1618 samples = None 1619 if kwargs.has_key('samples'): 1620 samples = kwargs['samples'] 1621 if samples is None and kwargs.has_key('data') \ 1622 and kwargs['data'].has_key('samples'): 1623 samples = kwargs['data']['samples'] 1624 if samples is None: 1625 raise DatasetError, \ 1626 "`samples` must be provided to copy constructor call." 1627 1628 if not len(samples.shape) == 2: 1629 raise DatasetError, \ 1630 "samples must be in 2D shape in copy constructor call."
    # read-only class properties
    # number of samples: rows of the 2D samples array
    nsamples = property( fget=getNSamples )
    # number of features: columns of the 2D samples array
    nfeatures = property( fget=getNFeatures )
    # optional mapping of labels; settable, validated by setLabelsMap
    labels_map = property( fget=getLabelsMap, fset=setLabelsMap )
def datasetmethod(func):
    """Decorator to easily bind functions to a Dataset class
    """
    if __debug__:
        debug("DS_", "Binding function %s to Dataset class" % func.__name__)

    # attach under the function's own name and hand the original back,
    # so it remains usable as a plain function as well
    setattr(Dataset, func.__name__, func)
    return func
# Following attributes adherent to the basic dataset
# Register the core sample-wise attributes on the Dataset class.
# `abbr` is the one-letter abbreviation, `hasunique` requests tracking of
# unique values for the attribute (presumably via a generated unique*
# property -- _registerAttribute is defined outside this chunk; verify).
Dataset._registerAttribute("samples", "_data", abbr='S', hasunique=False)
Dataset._registerAttribute("labels", "_data", abbr='L', hasunique=True)
Dataset._registerAttribute("chunks", "_data", abbr='C', hasunique=True)
# samples ids (already unique by definition)
Dataset._registerAttribute("origids", "_data", abbr='I', hasunique=False)