Package mvpa :: Package clfs :: Module meta
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.clfs.meta

   1  #emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*- 
   2  #ex: set sts=4 ts=4 sw=4 et: 
   3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
   4  # 
   5  #   See COPYING file distributed along with the PyMVPA package for the 
   6  #   copyright and license terms. 
   7  # 
   8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
   9  """Classes for meta classifiers -- classifiers which use other classifiers 
  10   
  11  Meta Classifiers can be grouped according to their function as 
  12   
  13  :group BoostedClassifiers: CombinedClassifier MulticlassClassifier 
  14    SplitClassifier 
  15  :group ProxyClassifiers: ProxyClassifier BinaryClassifier MappedClassifier 
  16    FeatureSelectionClassifier 
  17  :group PredictionsCombiners for CombinedClassifier: PredictionsCombiner 
  18    MaximalVote MeanPrediction 
  19   
  20  """ 
  21   
  22  __docformat__ = 'restructuredtext' 
  23   
  24  import operator 
  25  import numpy as N 
  26   
  27  from sets import Set 
  28   
  29  from mvpa.misc.args import group_kwargs 
  30  from mvpa.mappers.mask import MaskMapper 
  31  from mvpa.datasets.splitters import NFoldSplitter 
  32  from mvpa.misc.state import StateVariable, Stateful, Harvestable 
  33   
  34  from mvpa.clfs.base import Classifier 
  35  from mvpa.misc.transformers import FirstAxisMean 
  36   
  37  from mvpa.measures.base import \ 
  38      BoostedClassifierSensitivityAnalyzer, ProxyClassifierSensitivityAnalyzer, \ 
  39      MappedClassifierSensitivityAnalyzer 
  40   
  41  from mvpa.base import warning 
  42   
  43  if __debug__: 
  44      from mvpa.base import debug 
  45   
  46   
class BoostedClassifier(Classifier, Harvestable):
    """Classifier containing the farm of other classifiers.

    Should rarely be used directly. Use one of its childs instead
    """

    # should not be needed if we have prediction_values upstairs
    # raw_predictions should be handled as Harvestable???
    raw_predictions = StateVariable(enabled=False,
        doc="Predictions obtained from each classifier")

    raw_values = StateVariable(enabled=False,
        doc="Values obtained from each classifier")


    def __init__(self, clfs=None, propagate_states=True,
                 harvest_attribs=None, copy_attribs='copy',
                 **kwargs):
        """Initialize the instance.

        :Parameters:
          clfs : list
            list of classifier instances to use (slave classifiers)
          propagate_states : bool
            either to propagate enabled states into slave classifiers.
            It is in effect only when slaves get assigned - so if state
            is enabled not during construction, it would not necessarily
            propagate into slaves
          harvest_attribs : list
            attributes of the slave classifiers to harvest while training
            (see `Harvestable`)
          copy_attribs : str
            how to harvest the attributes (see `Harvestable`)
          kwargs : dict
            dict of keyworded arguments which might get used
            by State or Classifier
        """
        # identity test -- `== None` would needlessly go through rich
        # comparison machinery
        if clfs is None:
            clfs = []

        Classifier.__init__(self, **kwargs)
        Harvestable.__init__(self, harvest_attribs, copy_attribs)

        self.__clfs = None
        """Pylint friendly definition of __clfs"""

        self.__propagate_states = propagate_states
        """Enable current enabled states in slave classifiers"""

        self._setClassifiers(clfs)
        """Store the list of classifiers"""


    def __repr__(self, prefixes=None):
        # None default instead of mutable [] -- avoids the shared
        # default-argument pitfall; callers passing a list are unaffected
        if prefixes is None:
            prefixes = []
        if self.__clfs is None or len(self.__clfs) == 0:
            prefix_ = []
        else:
            # show only the first slave -- listing all would be too verbose
            prefix_ = ["clfs=[%s,...]" % repr(self.__clfs[0])]
        return super(BoostedClassifier, self).__repr__(prefix_ + prefixes)


    def _train(self, dataset):
        """Train `BoostedClassifier`: train each slave classifier in turn
        on the full dataset.
        """
        for clf in self.__clfs:
            clf.train(dataset)


    def _posttrain(self, dataset):
        """Custom posttrain of `BoostedClassifier`

        Harvest over the trained classifiers if it was asked to so
        """
        Classifier._posttrain(self, dataset)
        if self.states.isEnabled('harvested'):
            # harvest once per slave so per-slave attributes are collected
            for clf in self.__clfs:
                self._harvest(locals())
        if self.params.retrainable:
            # NOTE(review): name-mangled here to
            # _BoostedClassifier__changedData_isset -- presumably mirrors
            # the retrainable bookkeeping of base Classifier; confirm
            self.__changedData_isset = False


    def _getFeatureIds(self):
        """Custom _getFeatureIds for `BoostedClassifier`

        Returns the union of feature ids used by all slave classifiers.
        """
        feature_ids = Set([])
        for clf in self.__clfs:
            feature_ids = feature_ids.union(Set(clf.feature_ids))
        return list(feature_ids)


    def _predict(self, data):
        """Predict using `BoostedClassifier`

        Collects each slave's predictions (and values, when every slave
        provides them) and returns the raw per-slave predictions.
        """
        raw_predictions = [ clf.predict(data) for clf in self.__clfs ]
        self.raw_predictions = raw_predictions
        assert(len(self.__clfs)>0)
        if self.states.isEnabled("values"):
            if N.array([x.states.isEnabled("values")
                        for x in self.__clfs]).all():
                values = [ clf.values for clf in self.__clfs ]
                self.raw_values = values
            else:
                # single format string: the original concatenation produced
                # a garbled message ("...'values' stateenabled, ...")
                warning("One or more classifiers in %s has no 'values' state "
                        "enabled, thus BoostedClassifier can't have "
                        "'raw_values' state variable defined" % self)

        return raw_predictions


    def _setClassifiers(self, clfs):
        """Set the classifiers used by the boosted classifier

        We have to allow to set list of classifiers after the object
        was actually created. It will be used by
        MulticlassClassifier
        """
        self.__clfs = clfs
        """Classifiers to use"""

        # propagate parameter flags (currently only 'regression') upward:
        # boosted classifier is a regression iff any slave is
        for flag in ['regression']:
            values = N.array([clf.params[flag].value for clf in self.__clfs])
            value = values.any()
            if __debug__:
                debug("CLFBST", "Setting %(flag)s=%(value)s for classifiers "
                      "%(clfs)s with %(values)s",
                      msgargs={'flag' : flag, 'value' : value,
                               'clfs' : self.__clfs,
                               'values' : values})
            # set flag if it needs to be trained before predicting
            self.params[flag].value = value

        # enable corresponding states in the slave-classifiers
        if self.__propagate_states:
            for clf in self.__clfs:
                clf.states.enable(self.states.enabled, missingok=True)

        # adhere to their capabilities + 'multiclass'
        # XXX do intersection across all classifiers!
        self._clf_internals = [ 'binary', 'multiclass', 'meta' ]
        if len(clfs) > 0:
            self._clf_internals += self.__clfs[0]._clf_internals


    def untrain(self):
        """Untrain `BoostedClassifier`

        Has to untrain any known classifier
        """
        if not self.trained:
            return
        for clf in self.clfs:
            clf.untrain()
        super(BoostedClassifier, self).untrain()


    def getSensitivityAnalyzer(self, **kwargs):
        """Return an appropriate SensitivityAnalyzer"""
        return BoostedClassifierSensitivityAnalyzer(
            self,
            **kwargs)


    clfs = property(fget=lambda x:x.__clfs,
                    fset=_setClassifiers,
                    doc="Used classifiers")
207 208 209
class ProxyClassifier(Classifier):
    """Classifier which decorates another classifier

    Possible uses:

     - modify data somehow prior training/testing:
       * normalization
       * feature selection
       * modification

     - optimized classifier?

    """

    def __init__(self, clf, **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier based on which mask classifiers is created
        """

        # mirror the slave's regression flag so callers see a consistent mode
        Classifier.__init__(self, regression=clf.regression, **kwargs)

        self.__clf = clf
        """Store the classifier to use."""

        # adhere to slave classifier capabilities
        # TODO: unittest
        self._clf_internals = self._clf_internals[:] + ['meta']
        if clf is not None:
            self._clf_internals += clf._clf_internals


    def __repr__(self, prefixes=None):
        # None default instead of mutable [] -- avoids shared default list
        if prefixes is None:
            prefixes = []
        return super(ProxyClassifier, self).__repr__(
            ["clf=%s" % repr(self.__clf)] + prefixes)


    def summary(self):
        """Summary of this classifier, with the slave's summary indented
        underneath once trained."""
        s = super(ProxyClassifier, self).summary()
        if self.trained:
            s += "\n Slave classifier summary:" + \
                 '\n + %s' % \
                 (self.__clf.summary().replace('\n', '\n |'))
        return s


    def _train(self, dataset):
        """Train `ProxyClassifier`
        """
        # base class does nothing much -- just proxies requests to underlying
        # classifier
        self.__clf.train(dataset)

        # for the ease of access
        # TODO: if to copy we should exclude some states which are defined in
        #       base Classifier (such as training_time, predicting_time)
        # YOH: for now _copy_states_ would copy only set states variables. If
        #      anything needs to be overriden in the parent's class, it is
        #      welcome to do so
        #self.states._copy_states_(self.__clf, deep=False)


    def _predict(self, data):
        """Predict using `ProxyClassifier`
        """
        clf = self.__clf
        # make sure the slave exposes 'values' if we are asked for them
        if self.states.isEnabled('values'):
            clf.states.enable(['values'])

        result = clf.predict(data)
        # for the ease of access
        self.states._copy_states_(self.__clf, ['values'], deep=False)
        return result


    def untrain(self):
        """Untrain ProxyClassifier
        """
        # `is not None` instead of `not ... is None` -- same test, idiomatic
        if self.__clf is not None:
            self.__clf.untrain()
        super(ProxyClassifier, self).untrain()


    @group_kwargs(prefixes=['slave_'], passthrough=True)
    def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
        """Return an appropriate SensitivityAnalyzer"""
        return ProxyClassifierSensitivityAnalyzer(
            self,
            analyzer=self.__clf.getSensitivityAnalyzer(**slave_kwargs),
            **kwargs)


    clf = property(lambda x:x.__clf, doc="Used `Classifier`")
305 306 307 308 # 309 # Various combiners for CombinedClassifier 310 # 311
class PredictionsCombiner(Stateful):
    """Common ancestor of functors which merge the decisions of several
    classifiers into a single one."""

    def train(self, clfs, dataset):
        """Optional training stage of a combiner.

        :Parameters:
          clfs : list of Classifier
            List of classifiers to combine. Has to be classifiers (not
            pure predictions), since combiner might use some other
            state variables (value's) instead of pure prediction's
          dataset : Dataset
            training data in this case

        The base implementation is a no-op -- most combiners are
        stateless and need no training.
        """
        pass


    def __call__(self, clfs, dataset):
        """Derive the combined decision from `clfs`.

        :Parameters:
          clfs : list of Classifier
            List of classifiers to combine. Has to be classifiers (not
            pure predictions), since combiner might use some other
            state variables (value's) instead of pure prediction's

        Subclasses must override this.
        """
        raise NotImplementedError
class MaximalVote(PredictionsCombiner):
    """Provides a decision using maximal vote rule"""

    predictions = StateVariable(enabled=True,
        doc="Voted predictions")
    all_label_counts = StateVariable(enabled=False,
        doc="Counts across classifiers for each label/sample")

    def __init__(self):
        """XXX Might get a parameter to use raw decision values if
        voting is not unambiguous (ie two classes have equal number of
        votes
        """
        PredictionsCombiner.__init__(self)


    def __call__(self, clfs, dataset):
        """Actual callable - perform voting

        Extended functionality which might not be needed actually:
        Since `BinaryClassifier` might return a list of possible
        predictions (not just a single one), we should consider all of those

        MaximalVote doesn't care about dataset itself
        """
        if len(clfs) == 0:
            return []                   # nothing to vote on

        all_label_counts = None
        for clf in clfs:
            # Lets check first if necessary state variable is enabled
            # (parenthesized raise instead of py2-only `raise E, msg`)
            if not clf.states.isEnabled("predictions"):
                raise ValueError("MaximalVote needs classifiers (such as "
                                 "%s) with state 'predictions' enabled" % clf)
            predictions = clf.predictions
            if all_label_counts is None:
                # one counting dict per sample
                all_label_counts = [ {} for i in xrange(len(predictions)) ]

            # for every sample
            for i, prediction in enumerate(predictions):
                # `BinaryClassifier` may hand back a sequence of labels
                if not operator.isSequenceType(prediction):
                    prediction = (prediction,)
                for label in prediction: # for every label
                    # XXX we might have multiple labels assigned
                    # but might not -- don't remember now
                    # dict.get replaces deprecated has_key + manual init
                    all_label_counts[i][label] = \
                        all_label_counts[i].get(label, 0) + 1

        predictions = []
        # select maximal vote now for each sample
        for label_counts in all_label_counts:
            # explicit search for the max so we know whether it is unique
            maxk = []                   # labels of elements with max vote
            maxv = -1
            for k, v in label_counts.items():
                if v > maxv:
                    maxk = [k]
                    maxv = v
                elif v == maxv:
                    maxk.append(k)

            assert len(maxk) >= 1, \
                   "We should have obtained at least a single key of max label"

            if len(maxk) > 1:
                warning("We got multiple labels %s which have the " % maxk +
                        "same maximal vote %d. XXX disambiguate" % maxv)
            predictions.append(maxk[0])

        self.all_label_counts = all_label_counts
        self.predictions = predictions
        return predictions
class MeanPrediction(PredictionsCombiner):
    """Provides a decision by taking mean of the results
    """

    predictions = StateVariable(enabled=True,
        doc="Mean predictions")

    def __call__(self, clfs, dataset):
        """Actual callable - compute the mean over classifiers' predictions
        """
        if len(clfs) == 0:
            return []                   # nothing to average

        all_predictions = []
        for clf in clfs:
            # Lets check first if necessary state variable is enabled
            # (parenthesized raise instead of py2-only `raise E, msg`;
            # the original message also carried a doubled space)
            if not clf.states.isEnabled("predictions"):
                raise ValueError("MeanPrediction needs classifiers (such "
                                 "as %s) with state 'predictions' enabled"
                                 % clf)
            all_predictions.append(clf.predictions)

        # compute mean across classifiers, per sample
        predictions = N.mean(N.asarray(all_predictions), axis=0)
        self.predictions = predictions
        return predictions
class ClassifierCombiner(PredictionsCombiner):
    """Provides a decision using training a classifier on predictions/values

    TODO: implement
    """

    predictions = StateVariable(enabled=True,
        doc="Trained predictions")


    def __init__(self, clf, variables=None):
        """Initialize `ClassifierCombiner`

        :Parameters:
          clf : Classifier
            Classifier to train on the predictions
          variables : list of basestring
            List of state variables stored in 'combined' classifiers, which
            to use as features for training this classifier
        """
        PredictionsCombiner.__init__(self)

        self.__clf = clf
        """Classifier to train on `variables` states of provided classifiers"""

        # identity test -- `== None` is unidiomatic and may trigger rich
        # comparison on the argument
        if variables is None:
            variables = ['predictions']
        self.__variables = variables
        """What state variables of the classifiers to use"""


    def untrain(self):
        """It might be needed to untrain used classifier"""
        # explicit None check instead of truthiness on a classifier object
        if self.__clf is not None:
            self.__clf.untrain()


    def __call__(self, clfs, dataset):
        """Combine predictions via the trained classifier.

        Not implemented yet (see class-level TODO).
        """
        if len(clfs) == 0:
            return []                   # nothing to combine

        raise NotImplementedError
class CombinedClassifier(BoostedClassifier):
    """`BoostedClassifier` which combines predictions using some
    `PredictionsCombiner` functor.
    """

    def __init__(self, clfs=None, combiner=None, **kwargs):
        """Initialize the instance.

        :Parameters:
          clfs : list of Classifier
            list of classifier instances to use
          combiner : PredictionsCombiner
            callable which takes care about combining multiple
            results into a single one (e.g. maximal vote for
            classification, MeanPrediction for regression))
          kwargs : dict
            dict of keyworded arguments which might get used
            by State or Classifier

        NB: `combiner` might need to operate not on 'predictions' discrete
            labels but rather on raw 'class' values classifiers
            estimate (which is pretty much what is stored under
            `values`
        """
        if clfs is None:
            clfs = []

        BoostedClassifier.__init__(self, clfs, **kwargs)

        # assign default combiner: maximal vote for classification,
        # mean of predictions for regression
        if combiner is None:
            combiner = (MaximalVote, MeanPrediction)[int(self.regression)]()
        self.__combiner = combiner
        """Functor destined to combine results of multiple classifiers"""


    def __repr__(self, prefixes=None):
        """Literal representation of `CombinedClassifier`.
        """
        # None default instead of mutable []
        if prefixes is None:
            prefixes = []
        return super(CombinedClassifier, self).__repr__(
            ["combiner=%s" % repr(self.__combiner)] + prefixes)


    def summary(self):
        """Provide summary for the `CombinedClassifier`.
        """
        s = super(CombinedClassifier, self).summary()
        if self.trained:
            s += "\n Slave classifiers summaries:"
            for i, clf in enumerate(self.clfs):
                s += '\n + %d clf: %s' % \
                     (i, clf.summary().replace('\n', '\n |'))
        return s


    def untrain(self):
        """Untrain `CombinedClassifier`
        """
        # not every combiner implements untrain -- treat it as optional,
        # but don't swallow KeyboardInterrupt/SystemExit via a bare except
        try:
            self.__combiner.untrain()
        except Exception:
            pass
        super(CombinedClassifier, self).untrain()


    def _train(self, dataset):
        """Train `CombinedClassifier`
        """
        BoostedClassifier._train(self, dataset)
        # combiner might need to train as well
        self.__combiner.train(self.clfs, dataset)


    def _predict(self, data):
        """Predict using `CombinedClassifier`
        """
        BoostedClassifier._predict(self, data)
        # combiner will make use of state variables instead of only
        # predictions returned from _predict
        predictions = self.__combiner(self.clfs, data)
        self.predictions = predictions

        if self.states.isEnabled("values"):
            if self.__combiner.states.isActive("values"):
                # XXX or may be we could leave simply up to accessing
                # .combiner?
                self.values = self.__combiner.values
            else:
                if __debug__:
                    # original message asserted the opposite of the tested
                    # condition ("combiner has it active") -- we get here
                    # exactly when it does NOT
                    warning("Boosted classifier %s has 'values' state "
                            "enabled, but combiner doesn't have 'values' "
                            "active, thus no values could be provided "
                            "directly, access .clfs" % self)
        return predictions


    combiner = property(fget=lambda x:x.__combiner,
                        doc="Used combiner to derive a single result")
590 591 592
class BinaryClassifier(ProxyClassifier):
    """`ProxyClassifier` which maps set of two labels into +1 and -1
    """

    def __init__(self, clf, poslabels, neglabels, **kwargs):
        """
        :Parameters:
          clf : Classifier
            classifier to use
          poslabels : list
            list of labels which are treated as +1 category
          neglabels : list
            list of labels which are treated as -1 category
        """

        ProxyClassifier.__init__(self, clf, **kwargs)

        self._regressionIsBogus()

        # Handle labels
        sposlabels = Set(poslabels) # so to remove duplicates
        sneglabels = Set(neglabels) # so to remove duplicates

        # check if there is no overlap
        overlap = sposlabels.intersection(sneglabels)
        if len(overlap) > 0:
            # BUGFIX: the original format string had no %s placeholder, so
            # this branch raised TypeError ("not all arguments converted")
            # instead of the intended ValueError
            raise ValueError("Sets of positive and negative labels for "
                             "BinaryClassifier must not overlap. Got "
                             "overlap %s" % overlap)

        self.__poslabels = list(sposlabels)
        self.__neglabels = list(sneglabels)

        # define what values will be returned by predict: if there is
        # a single label - return just it alone, otherwise - whole
        # list
        # Such approach might come useful if we use some classifiers
        # over different subsets of data with some voting later on
        # (1-vs-therest?)

        if len(self.__poslabels) > 1:
            self.__predictpos = self.__poslabels
        else:
            self.__predictpos = self.__poslabels[0]

        if len(self.__neglabels) > 1:
            self.__predictneg = self.__neglabels
        else:
            self.__predictneg = self.__neglabels[0]


    def __repr__(self, prefixes=None):
        # None default instead of mutable []
        if prefixes is None:
            prefixes = []
        prefix = "poslabels=%s, neglabels=%s" % (
            repr(self.__poslabels), repr(self.__neglabels))
        return super(BinaryClassifier, self).__repr__([prefix] + prefixes)


    def _train(self, dataset):
        """Train `BinaryClassifier`

        Relabels the dataset to {+1, -1} for the pos/neg label groups and
        trains the slave classifier on the (possibly sub-selected) samples.
        """
        idlabels = [(x, +1) for x in dataset.idsbylabels(self.__poslabels)] + \
                   [(x, -1) for x in dataset.idsbylabels(self.__neglabels)]
        # XXX we have to sort ids since at the moment Dataset.selectSamples
        #     doesn't take care about order
        idlabels.sort()
        # select the samples
        orig_labels = None

        # If we need all samples, why simply not perform on original
        # data, an just store/restore labels. But it really should be done
        # within Dataset.selectSamples
        if len(idlabels) == dataset.nsamples \
            and [x[0] for x in idlabels] == range(dataset.nsamples):
            # the last condition is not even necessary... just overly
            # cautious
            datasetselected = dataset   # no selection is needed
            orig_labels = dataset.labels # but we would need to restore labels
            if __debug__:
                debug('CLFBIN',
                      "Assigned all %d samples for binary " %
                      (dataset.nsamples) +
                      " classification among labels %s/+1 and %s/-1" %
                      (self.__poslabels, self.__neglabels))
        else:
            datasetselected = dataset.selectSamples([ x[0] for x in idlabels ])
            if __debug__:
                debug('CLFBIN',
                      "Selected %d samples out of %d samples for binary " %
                      (len(idlabels), dataset.nsamples) +
                      " classification among labels %s/+1 and %s/-1" %
                      (self.__poslabels, self.__neglabels) +
                      ". Selected %s" % datasetselected)

        # adjust the labels
        datasetselected.labels = [ x[1] for x in idlabels ]

        # now we got a dataset with only 2 labels
        if __debug__:
            assert((datasetselected.uniquelabels == [-1, 1]).all())

        self.clf.train(datasetselected)

        # restore labels if we trained on the original (relabeled) dataset
        if orig_labels is not None:
            dataset.labels = orig_labels


    def _predict(self, data):
        """Predict the labels for a given `data`

        Predicts using binary classifier and spits out list (for each sample)
        where with either poslabels or neglabels as the "label" for the sample.
        If there was just a single label within pos or neg labels then it would
        return not a list but just that single label.
        """
        binary_predictions = ProxyClassifier._predict(self, data)
        self.values = binary_predictions
        predictions = [ {-1: self.__predictneg,
                         +1: self.__predictpos}[x] for x in binary_predictions]
        self.predictions = predictions
        return predictions
class MulticlassClassifier(CombinedClassifier):
    """`CombinedClassifier` to perform multiclass using a list of
    `BinaryClassifier`.

    such as 1-vs-1 (ie in pairs like libsvm doesn) or 1-vs-all (which
    is yet to think about)
    """

    def __init__(self, clf, bclf_type="1-vs-1", **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier based on which multiple classifiers are created
            for multiclass
          bclf_type
            "1-vs-1" or "1-vs-all", determines the way to generate binary
            classifiers
        """
        CombinedClassifier.__init__(self, **kwargs)
        self._regressionIsBogus()
        if clf is not None:
            clf._regressionIsBogus()

        self.__clf = clf
        """Store sample instance of basic classifier"""

        # Some checks on known ways to do multiclass
        if bclf_type == "1-vs-1":
            pass
        elif bclf_type == "1-vs-all": # TODO
            raise NotImplementedError
        else:
            # parenthesized raise (py2-only statement syntax dropped);
            # message now names this class, not the nonexistent
            # "BoostedMulticlassClassifier"
            raise ValueError("Unknown type of classifier %s for "
                             "MulticlassClassifier" % bclf_type)
        self.__bclf_type = bclf_type

    # XXX fix it up a bit... it seems that MulticlassClassifier should
    # be actually ProxyClassifier and use BoostedClassifier internally
    def __repr__(self, prefixes=None):
        if prefixes is None:
            prefixes = []
        prefix = "bclf_type=%s, clf=%s" % (repr(self.__bclf_type),
                                           repr(self.__clf))
        return super(MulticlassClassifier, self).__repr__([prefix] + prefixes)


    def _train(self, dataset):
        """Train classifier

        Builds one `BinaryClassifier` per label pair (1-vs-1), then trains
        them all via `CombinedClassifier`.
        """
        # construct binary classifiers
        ulabels = dataset.uniquelabels
        if self.__bclf_type == "1-vs-1":
            # generate pairs and corresponding classifiers
            biclfs = []
            for i in xrange(len(ulabels)):
                for j in xrange(i + 1, len(ulabels)):
                    clf = self.__clf.clone()
                    biclfs.append(
                        BinaryClassifier(
                            clf,
                            poslabels=[ulabels[i]], neglabels=[ulabels[j]]))
            if __debug__:
                debug("CLFMC", "Created %d binary classifiers for %d labels" %
                      (len(biclfs), len(ulabels)))

            self.clfs = biclfs

        elif self.__bclf_type == "1-vs-all":
            raise NotImplementedError

        # perform actual training
        CombinedClassifier._train(self, dataset)
class SplitClassifier(CombinedClassifier):
    """`BoostedClassifier` to work on splits of the data

    """

    """
    TODO: SplitClassifier and MulticlassClassifier have too much in
          common -- need to refactor: just need a splitter which would
          split dataset in pairs of class labels. MulticlassClassifier
          does just a tiny bit more which might be not necessary at
          all: map sets of labels into 2 categories...
    """

    # TODO: unify with CrossValidatedTransferError which now uses
    #       harvest_attribs to expose gathered attributes
    confusion = StateVariable(enabled=False,
        doc="Resultant confusion whenever classifier trained " +
            "on 1 part and tested on 2nd part of each split")

    splits = StateVariable(enabled=False, doc=
        """Store the actual splits of the data. Can be memory expensive""")

    # ??? couldn't be training_confusion since it has other meaning
    #     here, BUT it is named so within CrossValidatedTransferError
    #     -- unify
    # decided to go with overriding semantics tiny bit. For split
    # classifier training_confusion would correspond to summary
    # over training errors across all splits. Later on if need comes
    # we might want to implement global_training_confusion which would
    # correspond to overall confusion on full training dataset as it is
    # done in base Classifier
    #global_training_confusion = StateVariable(enabled=False,
    #    doc="Summary over training confusions acquired at each split")

    def __init__(self, clf, splitter=NFoldSplitter(cvtype=1), **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier based on which multiple classifiers are created
            for multiclass
          splitter : Splitter
            `Splitter` to use to split the dataset prior training
        """

        CombinedClassifier.__init__(self, regression=clf.regression, **kwargs)
        self.__clf = clf
        """Store sample instance of basic classifier"""

        if isinstance(splitter, type):
            # parenthesized raise instead of py2-only statement syntax
            raise ValueError(
                "Please provide an instance of a splitter, not a type."
                " Got %s" % splitter)

        self.__splitter = splitter


    def _train(self, dataset):
        """Train `SplitClassifier`

        Clones the template classifier once per split, trains each clone
        on the training part of its split, and (optionally) accumulates
        confusion over the testing parts.
        """
        # generate pairs and corresponding classifiers
        bclfs = []

        # local binding
        states = self.states

        clf_template = self.__clf
        if states.isEnabled('confusion'):
            states.confusion = clf_template._summaryClass()
        if states.isEnabled('training_confusion'):
            clf_template.states.enable(['training_confusion'])
            states.training_confusion = clf_template._summaryClass()

        clf_hastestdataset = hasattr(clf_template, 'testdataset')

        # for proper and easier debugging - first define classifiers and then
        # train them
        for _split in self.__splitter.splitcfg(dataset):
            # only the number of splits matters here -- one clone per split
            if __debug__:
                debug("CLFSPL",
                      "Deepcopying %(clf)s for %(sclf)s",
                      msgargs={'clf':clf_template,
                               'sclf':self})
            clf = clf_template.clone()
            bclfs.append(clf)
        self.clfs = bclfs

        self.splits = []

        for i, split in enumerate(self.__splitter(dataset)):
            if __debug__:
                debug("CLFSPL", "Training classifier for split %d" % (i))

            if states.isEnabled("splits"):
                self.splits.append(split)

            clf = self.clfs[i]

            # assign testing dataset if given classifier can digest it
            if clf_hastestdataset:
                clf.testdataset = split[1]

            clf.train(split[0])

            # unbind the testdataset from the classifier
            if clf_hastestdataset:
                clf.testdataset = None

            if states.isEnabled("confusion"):
                predictions = clf.predict(split[1].samples)
                self.confusion.add(split[1].labels, predictions,
                                   clf.states.get('values', None))
            if states.isEnabled("training_confusion"):
                states.training_confusion += \
                    clf.states.training_confusion
        # hackish way -- so it should work only for ConfusionMatrix???
        # narrow from bare except: don't mask KeyboardInterrupt/SystemExit
        try:
            if states.isEnabled("confusion"):
                states.confusion.labels_map = dataset.labels_map
            if states.isEnabled("training_confusion"):
                states.training_confusion.labels_map = dataset.labels_map
        except Exception:
            pass


    @group_kwargs(prefixes=['slave_'], passthrough=True)
    def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
        """Return an appropriate SensitivityAnalyzer for `SplitClassifier`

        :Parameters:
          combiner
            If not provided, FirstAxisMean is assumed
        """
        kwargs.setdefault('combiner', FirstAxisMean)
        return BoostedClassifierSensitivityAnalyzer(
            self,
            analyzer=self.__clf.getSensitivityAnalyzer(**slave_kwargs),
            **kwargs)

    splitter = property(fget=lambda x:x.__splitter,
                        doc="Splitter user by SplitClassifier")
931 932
class MappedClassifier(ProxyClassifier):
    """`ProxyClassifier` which uses some mapper prior training/testing.

    `MaskMapper` can be used just a subset of features to
    train/classify.
    Having such classifier we can easily create a set of classifiers
    for BoostedClassifier, where each classifier operates on some set
    of features, e.g. set of best spheres from SearchLight, set of
    ROIs selected elsewhere. It would be different from simply
    applying whole mask over the dataset, since here initial decision
    is made by each classifier and then later on they vote for the
    final decision across the set of classifiers.
    """

    def __init__(self, clf, mapper, **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier based on which mask classifiers is created
          mapper
            whatever `Mapper` comes handy
        """
        ProxyClassifier.__init__(self, clf, **kwargs)

        self.__mapper = mapper
        """mapper to help us our with prepping data to
        training/classification"""


    def _train(self, dataset):
        """Train `MappedClassifier`
        """
        # first train the mapper
        # XXX: should training be done using whole dataset or just samples
        # YYY: in some cases labels might be needed, thus better full dataset
        self.__mapper.train(dataset)

        # for train() we have to provide dataset -- not just samples to train!
        wdataset = dataset.applyMapper(featuresmapper = self.__mapper)
        ProxyClassifier._train(self, wdataset)


    def _predict(self, data):
        """Predict using `MappedClassifier`: map the data first, then
        delegate to the slave.
        """
        return ProxyClassifier._predict(self, self.__mapper.forward(data))


    @group_kwargs(prefixes=['slave_'], passthrough=True)
    def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
        """Return an appropriate SensitivityAnalyzer"""
        # BUGFIX: the original used self.__clf, which name-mangles to
        # _MappedClassifier__clf -- an attribute that never exists (the
        # slave classifier is stored privately by ProxyClassifier), so
        # this method raised AttributeError; use the public `clf` property
        return MappedClassifierSensitivityAnalyzer(
            self,
            analyzer=self.clf.getSensitivityAnalyzer(**slave_kwargs),
            **kwargs)


    mapper = property(lambda x:x.__mapper, doc="Used mapper")
992 993 994
class FeatureSelectionClassifier(ProxyClassifier):
    """`ProxyClassifier` which uses some `FeatureSelection` prior training.

    `FeatureSelection` is used first to select features for the classifier to
    use for prediction. Internally it would rely on MappedClassifier which
    would use created MaskMapper.

    TODO: think about removing overhead of retraining the same classifier if
    feature selection was carried out with the same classifier already. It
    has been addressed by adding .trained property to classifier, but now
    we should explicitly use isTrained here if we want... need to think more
    """

    # advertise that this meta classifier performs feature selection itself
    _clf_internals = [ 'does_feature_selection', 'meta' ]

    def __init__(self, clf, feature_selection, testdataset=None, **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier based on which mask classifiers is created
          feature_selection : FeatureSelection
            whatever `FeatureSelection` comes handy
          testdataset : Dataset
            optional dataset which would be given on call to feature_selection
        """
        ProxyClassifier.__init__(self, clf, **kwargs)

        self.__maskclf = None
        """Should become `MappedClassifier`(mapper=`MaskMapper`) later on."""

        self.__feature_selection = feature_selection
        """`FeatureSelection` to select the features prior training"""

        self.__testdataset = testdataset
        """`FeatureSelection` might like to use testdataset"""


    def untrain(self):
        """Untrain `FeatureSelectionClassifier`

        Has to untrain any known classifier
        """
        if not self.trained:
            return
        # the internal MappedClassifier exists only after a train();
        # untrain it too so the mask/mapper state is dropped
        if not self.__maskclf is None:
            self.__maskclf.untrain()
        super(FeatureSelectionClassifier, self).untrain()


    def _train(self, dataset):
        """Train `FeatureSelectionClassifier`

        Runs the feature selection, builds a `MaskMapper` from the selected
        ids, wraps the slave classifier into a `MappedClassifier`, and
        trains it on the already-selected (working) dataset.
        """
        # temporarily enable selected_ids -- needed below to build the mask;
        # restored via _resetEnabledTemporarily() once done
        self.__feature_selection.states._changeTemporarily(
            enable_states=["selected_ids"])

        if __debug__:
            debug("CLFFS", "Performing feature selection using %s" %
                  self.__feature_selection + " on %s" % dataset)

        # feature selection returns (working dataset, test dataset) with
        # the selected features only
        (wdataset, tdataset) = self.__feature_selection(dataset,
                                                        self.__testdataset)
        if __debug__:
            add_ = ""
            if "CLFFS_" in debug.active:
                add_ = " Selected features: %s" % \
                       self.__feature_selection.selected_ids
            debug("CLFFS", "%(fs)s selected %(nfeat)d out of " +
                  "%(dsnfeat)d features.%(app)s",
                  msgargs={'fs':self.__feature_selection,
                           'nfeat':wdataset.nfeatures,
                           'dsnfeat':dataset.nfeatures,
                           'app':add_})

        # create a mask to devise a mapper
        # TODO -- think about making selected_ids a MaskMapper
        mappermask = N.zeros(dataset.nfeatures)
        mappermask[self.__feature_selection.selected_ids] = 1
        mapper = MaskMapper(mappermask)

        self.__feature_selection.states._resetEnabledTemporarily()

        # create and assign `MappedClassifier`
        self.__maskclf = MappedClassifier(self.clf, mapper)
        # we could have called self.__clf.train(dataset), but it would
        # cause unnecessary masking
        # (wdataset already has only the selected features, so train the
        # inner slave directly on it)
        self.__maskclf.clf.train(wdataset)

        # for the ease of access
        # TODO see for ProxyClassifier
        #self.states._copy_states_(self.__maskclf, deep=False)

    def _getFeatureIds(self):
        """Return used feature ids for `FeatureSelectionClassifier`

        """
        return self.__feature_selection.selected_ids


    def _predict(self, data):
        """Predict using `FeatureSelectionClassifier`

        NOTE(review): calls the MappedClassifier's _predict() directly,
        bypassing its public predict() machinery -- presumably intentional
        to avoid duplicated state handling; confirm before changing.
        """
        clf = self.__maskclf
        if self.states.isEnabled('values'):
            clf.states.enable(['values'])

        result = clf._predict(data)
        # for the ease of access
        self.states._copy_states_(clf, ['values'], deep=False)
        return result


    def setTestDataset(self, testdataset):
        """Set testing dataset to be used for feature selection
        """
        self.__testdataset = testdataset

    maskclf = property(lambda x:x.__maskclf, doc="Used `MappedClassifier`")
    feature_selection = property(lambda x:x.__feature_selection,
                                 doc="Used `FeatureSelection`")

    @group_kwargs(prefixes=['slave_'], passthrough=True)
    def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
        """Return an appropriate SensitivityAnalyzer

        had to clone from mapped classifier???
        """
        return MappedClassifierSensitivityAnalyzer(
            self,
            analyzer=self.clf.getSensitivityAnalyzer(**slave_kwargs),
            **kwargs)



    testdataset = property(fget=lambda x:x.__testdataset,
                           fset=setTestDataset)
1130