1
2
3
4
5
6
7
8
9 """Classes for meta classifiers -- classifiers which use other classifiers
10
11 Meta Classifiers can be grouped according to their function as
12
13 :group BoostedClassifiers: CombinedClassifier MulticlassClassifier
14 SplitClassifier
15 :group ProxyClassifiers: ProxyClassifier BinaryClassifier MappedClassifier
16 FeatureSelectionClassifier
17 :group PredictionsCombiners for CombinedClassifier: PredictionsCombiner
18 MaximalVote MeanPrediction
19
20 """
21
22 __docformat__ = 'restructuredtext'
23
24 import operator
25 import numpy as N
26
27 from sets import Set
28
29 from mvpa.misc.args import group_kwargs
30 from mvpa.mappers.mask import MaskMapper
31 from mvpa.datasets.splitters import NFoldSplitter
32 from mvpa.misc.state import StateVariable, Stateful, Harvestable
33
34 from mvpa.clfs.base import Classifier
35 from mvpa.misc.transformers import FirstAxisMean
36
37 from mvpa.measures.base import \
38 BoostedClassifierSensitivityAnalyzer, ProxyClassifierSensitivityAnalyzer, \
39 MappedClassifierSensitivityAnalyzer
40
41 from mvpa.base import warning
42
43 if __debug__:
44 from mvpa.base import debug
45
46
48 """Classifier containing the farm of other classifiers.
49
50 Should rarely be used directly. Use one of its children instead.
51 """
52
53
54
55 raw_predictions = StateVariable(enabled=False,
56 doc="Predictions obtained from each classifier")
57
58 raw_values = StateVariable(enabled=False,
59 doc="Values obtained from each classifier")
60
61
62 - def __init__(self, clfs=None, propagate_states=True,
63 harvest_attribs=None, copy_attribs='copy',
64 **kwargs):
65 """Initialize the instance.
66
67 :Parameters:
68 clfs : list
69 list of classifier instances to use (slave classifiers)
70 propagate_states : bool
71 either to propagate enabled states into slave classifiers.
72 It is in effect only when slaves get assigned - so if state
73 is enabled not during construction, it would not necessarily
74 propagate into slaves
75 kwargs : dict
76 dict of keyworded arguments which might get used
77 by State or Classifier
78 """
79 if clfs == None:
80 clfs = []
81
82 Classifier.__init__(self, **kwargs)
83 Harvestable.__init__(self, harvest_attribs, copy_attribs)
84
85 self.__clfs = None
86 """Pylint friendly definition of __clfs"""
87
88 self.__propagate_states = propagate_states
89 """Enable current enabled states in slave classifiers"""
90
91 self._setClassifiers(clfs)
92 """Store the list of classifiers"""
93
94
96 if self.__clfs is None or len(self.__clfs)==0:
97
98 prefix_ = []
99 else:
100 prefix_ = ["clfs=[%s,...]" % repr(self.__clfs[0])]
101 return super(BoostedClassifier, self).__repr__(prefix_ + prefixes)
102
103
105 """Train `BoostedClassifier`
106 """
107 for clf in self.__clfs:
108 clf.train(dataset)
109
110
    def _posttrain(self, dataset):
        """Custom posttrain of `BoostedClassifier`

        Harvest over the trained classifiers if it was asked to so
        """
        Classifier._posttrain(self, dataset)
        if self.states.isEnabled('harvested'):
            # NOTE: _harvest picks attributes out of locals(), so the loop
            # variable MUST be named `clf` -- do not rename it
            for clf in self.__clfs:
                self._harvest(locals())
        if self.params.retrainable:
            # presumably resets the cached "data changed" flag for the
            # retrainable machinery -- TODO confirm against Classifier
            self.__changedData_isset = False
122
123
132
133
151
152
154 """Set the classifiers used by the boosted classifier
155
156 We have to allow to set list of classifiers after the object
157 was actually created. It will be used by
158 MulticlassClassifier
159 """
160 self.__clfs = clfs
161 """Classifiers to use"""
162
163 for flag in ['regression']:
164 values = N.array([clf.params[flag].value for clf in self.__clfs])
165 value = values.any()
166 if __debug__:
167 debug("CLFBST", "Setting %(flag)s=%(value)s for classifiers "
168 "%(clfs)s with %(values)s",
169 msgargs={'flag' : flag, 'value' : value,
170 'clfs' : self.__clfs,
171 'values' : values})
172
173 self.params[flag].value = value
174
175
176 if self.__propagate_states:
177 for clf in self.__clfs:
178 clf.states.enable(self.states.enabled, missingok=True)
179
180
181
182 self._clf_internals = [ 'binary', 'multiclass', 'meta' ]
183 if len(clfs)>0:
184 self._clf_internals += self.__clfs[0]._clf_internals
185
196
202
203
204 clfs = property(fget=lambda x:x.__clfs,
205 fset=_setClassifiers,
206 doc="Used classifiers")
207
208
209
211 """Classifier which decorates another classifier
212
213 Possible uses:
214
215 - modify data somehow prior training/testing:
216 * normalization
217 * feature selection
218 * modification
219
220 - optimized classifier?
221
222 """
223
242
243
247
249 s = super(ProxyClassifier, self).summary()
250 if self.trained:
251 s += "\n Slave classifier summary:" + \
252 '\n + %s' % \
253 (self.__clf.summary().replace('\n', '\n |'))
254 return s
255
256
257
259 """Train `ProxyClassifier`
260 """
261
262
263 self.__clf.train(dataset)
264
265
266
267
268
269
270
271
272
273
275 """Predict using `ProxyClassifier`
276 """
277 clf = self.__clf
278 if self.states.isEnabled('values'):
279 clf.states.enable(['values'])
280
281 result = clf.predict(data)
282
283 self.states._copy_states_(self.__clf, ['values'], deep=False)
284 return result
285
286
293
294
295 @group_kwargs(prefixes=['slave_'], passthrough=True)
302
303
304 clf = property(lambda x:x.__clf, doc="Used `Classifier`")
305
306
307
308
309
310
311
313 """Base class for combining decisions of multiple classifiers"""
314
315 - def train(self, clfs, dataset):
316 """PredictionsCombiner might need to be trained
317
318 :Parameters:
319 clfs : list of Classifier
320 List of classifiers to combine. Has to be classifiers (not
321 pure predictions), since combiner might use some other
322 state variables (value's) instead of pure prediction's
323 dataset : Dataset
324 training data in this case
325 """
326 pass
327
328
330 """Call function
331
332 :Parameters:
333 clfs : list of Classifier
334 List of classifiers to combine. Has to be classifiers (not
335 pure predictions), since combiner might use some other
336 state variables (value's) instead of pure prediction's
337 """
338 raise NotImplementedError
339
340
341
343 """Provides a decision using maximal vote rule"""
344
345 predictions = StateVariable(enabled=True,
346 doc="Voted predictions")
347 all_label_counts = StateVariable(enabled=False,
348 doc="Counts across classifiers for each label/sample")
349
351 """XXX Might get a parameter to use raw decision values if
352 voting is ambiguous (i.e. two classes have an equal number of
353 votes)
354 """
355 PredictionsCombiner.__init__(self)
356
357
359 """Actual callable -- perform voting
360
361 Extended functionality which might not be needed actually:
362 Since `BinaryClassifier` might return a list of possible
363 predictions (not just a single one), we should consider all of those
364
365 MaximalVote doesn't care about dataset itself
366 """
367 if len(clfs)==0:
368 return []
369
370 all_label_counts = None
371 for clf in clfs:
372
373 if not clf.states.isEnabled("predictions"):
374 raise ValueError, "MaximalVote needs classifiers (such as " + \
375 "%s) with state 'predictions' enabled" % clf
376 predictions = clf.predictions
377 if all_label_counts is None:
378 all_label_counts = [ {} for i in xrange(len(predictions)) ]
379
380
381 for i in xrange(len(predictions)):
382 prediction = predictions[i]
383 if not operator.isSequenceType(prediction):
384 prediction = (prediction,)
385 for label in prediction:
386
387
388 if not all_label_counts[i].has_key(label):
389 all_label_counts[i][label] = 0
390 all_label_counts[i][label] += 1
391
392 predictions = []
393
394 for i in xrange(len(all_label_counts)):
395 label_counts = all_label_counts[i]
396
397
398 maxk = []
399 maxv = -1
400 for k, v in label_counts.iteritems():
401 if v > maxv:
402 maxk = [k]
403 maxv = v
404 elif v == maxv:
405 maxk.append(k)
406
407 assert len(maxk) >= 1, \
408 "We should have obtained at least a single key of max label"
409
410 if len(maxk) > 1:
411 warning("We got multiple labels %s which have the " % maxk +
412 "same maximal vote %d. XXX disambiguate" % maxv)
413 predictions.append(maxk[0])
414
415 self.all_label_counts = all_label_counts
416 self.predictions = predictions
417 return predictions
418
419
420
422 """Provides a decision by taking mean of the results
423 """
424
425 predictions = StateVariable(enabled=True,
426 doc="Mean predictions")
427
429 """Actual callable -- perform averaging of predictions
430
431 """
432 if len(clfs)==0:
433 return []
434
435 all_predictions = []
436 for clf in clfs:
437
438 if not clf.states.isEnabled("predictions"):
439 raise ValueError, "MeanPrediction needs classifiers (such " \
440 " as %s) with state 'predictions' enabled" % clf
441 all_predictions.append(clf.predictions)
442
443
444 predictions = N.mean(N.asarray(all_predictions), axis=0)
445 self.predictions = predictions
446 return predictions
447
448
450 """Provides a decision using training a classifier on predictions/values
451
452 TODO: implement
453 """
454
455 predictions = StateVariable(enabled=True,
456 doc="Trained predictions")
457
458
459 - def __init__(self, clf, variables=None):
460 """Initialize `ClassifierCombiner`
461
462 :Parameters:
463 clf : Classifier
464 Classifier to train on the predictions
465 variables : list of basestring
466 List of state variables stored in 'combined' classifiers, which
467 to use as features for training this classifier
468 """
469 PredictionsCombiner.__init__(self)
470
471 self.__clf = clf
472 """Classifier to train on `variables` states of provided classifiers"""
473
474 if variables == None:
475 variables = ['predictions']
476 self.__variables = variables
477 """What state variables of the classifiers to use"""
478
479
481 """It might be needed to untrain used classifier"""
482 if self.__clf:
483 self.__clf.untrain()
484
486 """
487 """
488 if len(clfs)==0:
489 return []
490
491 raise NotImplementedError
492
493
494
496 """`BoostedClassifier` which combines predictions using some
497 `PredictionsCombiner` functor.
498 """
499
500 - def __init__(self, clfs=None, combiner=None, **kwargs):
501 """Initialize the instance.
502
503 :Parameters:
504 clfs : list of Classifier
505 list of classifier instances to use
506 combiner : PredictionsCombiner
507 callable which takes care about combining multiple
508 results into a single one (e.g. maximal vote for
509 classification, MeanPrediction for regression))
510 kwargs : dict
511 dict of keyworded arguments which might get used
512 by State or Classifier
513
514 NB: `combiner` might need to operate not on 'predictions' descrete
515 labels but rather on raw 'class' values classifiers
516 estimate (which is pretty much what is stored under
517 `values`
518 """
519 if clfs == None:
520 clfs = []
521
522 BoostedClassifier.__init__(self, clfs, **kwargs)
523
524
525 if combiner is None:
526 combiner = (MaximalVote, MeanPrediction)[int(self.regression)]()
527 self.__combiner = combiner
528 """Functor destined to combine results of multiple classifiers"""
529
530
532 """Literal representation of `CombinedClassifier`.
533 """
534 return super(CombinedClassifier, self).__repr__(
535 ["combiner=%s" % repr(self.__combiner)] + prefixes)
536
537
539 """Provide summary for the `CombinedClassifier`.
540 """
541 s = super(CombinedClassifier, self).summary()
542 if self.trained:
543 s += "\n Slave classifiers summaries:"
544 for i, clf in enumerate(self.clfs):
545 s += '\n + %d clf: %s' % \
546 (i, clf.summary().replace('\n', '\n |'))
547 return s
548
549
558
565
566
586
587
588 combiner = property(fget=lambda x:x.__combiner,
589 doc="Used combiner to derive a single result")
590
591
592
594 """`ProxyClassifier` which maps set of two labels into +1 and -1
595 """
596
597 - def __init__(self, clf, poslabels, neglabels, **kwargs):
598 """
599 :Parameters:
600 clf : Classifier
601 classifier to use
602 poslabels : list
603 list of labels which are treated as +1 category
604 neglabels : list
605 list of labels which are treated as -1 category
606 """
607
608 ProxyClassifier.__init__(self, clf, **kwargs)
609
610 self._regressionIsBogus()
611
612
613 sposlabels = Set(poslabels)
614 sneglabels = Set(neglabels)
615
616
617 overlap = sposlabels.intersection(sneglabels)
618 if len(overlap)>0:
619 raise ValueError("Sets of positive and negative labels for " +
620 "BinaryClassifier must not overlap. Got overlap " %
621 overlap)
622
623 self.__poslabels = list(sposlabels)
624 self.__neglabels = list(sneglabels)
625
626
627
628
629
630
631
632
633 if len(self.__poslabels) > 1:
634 self.__predictpos = self.__poslabels
635 else:
636 self.__predictpos = self.__poslabels[0]
637
638 if len(self.__neglabels) > 1:
639 self.__predictneg = self.__neglabels
640 else:
641 self.__predictneg = self.__neglabels[0]
642
643
645 prefix = "poslabels=%s, neglabels=%s" % (
646 repr(self.__poslabels), repr(self.__neglabels))
647 return super(BinaryClassifier, self).__repr__([prefix] + prefixes)
648
649
651 """Train `BinaryClassifier`
652 """
653 idlabels = [(x, +1) for x in dataset.idsbylabels(self.__poslabels)] + \
654 [(x, -1) for x in dataset.idsbylabels(self.__neglabels)]
655
656
657 idlabels.sort()
658
659 orig_labels = None
660
661
662
663
664 if len(idlabels) == dataset.nsamples \
665 and [x[0] for x in idlabels] == range(dataset.nsamples):
666
667
668 datasetselected = dataset
669 orig_labels = dataset.labels
670 if __debug__:
671 debug('CLFBIN',
672 "Assigned all %d samples for binary " %
673 (dataset.nsamples) +
674 " classification among labels %s/+1 and %s/-1" %
675 (self.__poslabels, self.__neglabels))
676 else:
677 datasetselected = dataset.selectSamples([ x[0] for x in idlabels ])
678 if __debug__:
679 debug('CLFBIN',
680 "Selected %d samples out of %d samples for binary " %
681 (len(idlabels), dataset.nsamples) +
682 " classification among labels %s/+1 and %s/-1" %
683 (self.__poslabels, self.__neglabels) +
684 ". Selected %s" % datasetselected)
685
686
687 datasetselected.labels = [ x[1] for x in idlabels ]
688
689
690 if __debug__:
691 assert((datasetselected.uniquelabels == [-1, 1]).all())
692
693 self.clf.train(datasetselected)
694
695 if not orig_labels is None:
696 dataset.labels = orig_labels
697
699 """Predict the labels for a given `data`
700
701 Predicts using binary classifier and spits out list (for each sample)
702 where with either poslabels or neglabels as the "label" for the sample.
703 If there was just a single label within pos or neg labels then it would
704 return not a list but just that single label.
705 """
706 binary_predictions = ProxyClassifier._predict(self, data)
707 self.values = binary_predictions
708 predictions = [ {-1: self.__predictneg,
709 +1: self.__predictpos}[x] for x in binary_predictions]
710 self.predictions = predictions
711 return predictions
712
713
714
716 """`CombinedClassifier` to perform multiclass using a list of
717 `BinaryClassifier`.
718
719 such as 1-vs-1 (i.e. in pairs like libsvm does) or 1-vs-all (which
720 is yet to think about)
721 """
722
723 - def __init__(self, clf, bclf_type="1-vs-1", **kwargs):
724 """Initialize the instance
725
726 :Parameters:
727 clf : Classifier
728 classifier based on which multiple classifiers are created
729 for multiclass
730 bclf_type
731 "1-vs-1" or "1-vs-all", determines the way to generate binary
732 classifiers
733 """
734 CombinedClassifier.__init__(self, **kwargs)
735 self._regressionIsBogus()
736 if not clf is None:
737 clf._regressionIsBogus()
738
739 self.__clf = clf
740 """Store sample instance of basic classifier"""
741
742
743 if bclf_type == "1-vs-1":
744 pass
745 elif bclf_type == "1-vs-all":
746 raise NotImplementedError
747 else:
748 raise ValueError, \
749 "Unknown type of classifier %s for " % bclf_type + \
750 "BoostedMulticlassClassifier"
751 self.__bclf_type = bclf_type
752
753
754
756 prefix = "bclf_type=%s, clf=%s" % (repr(self.__bclf_type),
757 repr(self.__clf))
758 return super(MulticlassClassifier, self).__repr__([prefix] + prefixes)
759
760
762 """Train classifier
763 """
764
765 ulabels = dataset.uniquelabels
766 if self.__bclf_type == "1-vs-1":
767
768 biclfs = []
769 for i in xrange(len(ulabels)):
770 for j in xrange(i+1, len(ulabels)):
771 clf = self.__clf.clone()
772 biclfs.append(
773 BinaryClassifier(
774 clf,
775 poslabels=[ulabels[i]], neglabels=[ulabels[j]]))
776 if __debug__:
777 debug("CLFMC", "Created %d binary classifiers for %d labels" %
778 (len(biclfs), len(ulabels)))
779
780 self.clfs = biclfs
781
782 elif self.__bclf_type == "1-vs-all":
783 raise NotImplementedError
784
785
786 CombinedClassifier._train(self, dataset)
787
788
789
791 """`BoostedClassifier` to work on splits of the data
792
793 """
794
795 """
796 TODO: SplitClassifier and MulticlassClassifier have too much in
797 common -- need to refactor: just need a splitter which would
798 split dataset in pairs of class labels. MulticlassClassifier
799 does just a tiny bit more which might be not necessary at
800 all: map sets of labels into 2 categories...
801 """
802
803
804
805 confusion = StateVariable(enabled=False,
806 doc="Resultant confusion whenever classifier trained " +
807 "on 1 part and tested on 2nd part of each split")
808
809 splits = StateVariable(enabled=False, doc=
810 """Store the actual splits of the data. Can be memory expensive""")
811
812
813
814
815
816
817
818
819
820
821
822
823
825 """Initialize the instance
826
827 :Parameters:
828 clf : Classifier
829 classifier based on which multiple classifiers are created
830 for multiclass
831 splitter : Splitter
832 `Splitter` to use to split the dataset prior training
833 """
834
835 CombinedClassifier.__init__(self, regression=clf.regression, **kwargs)
836 self.__clf = clf
837 """Store sample instance of basic classifier"""
838
839 if isinstance(splitter, type):
840 raise ValueError, \
841 "Please provide an instance of a splitter, not a type." \
842 " Got %s" % splitter
843
844 self.__splitter = splitter
845
846
913
914
915 @group_kwargs(prefixes=['slave_'], passthrough=True)
928
929 splitter = property(fget=lambda x:x.__splitter,
930 doc="Splitter user by SplitClassifier")
931
932
934 """`ProxyClassifier` which uses some mapper prior training/testing.
935
936 `MaskMapper` can be used just a subset of features to
937 train/classify.
938 Having such classifier we can easily create a set of classifiers
939 for BoostedClassifier, where each classifier operates on some set
940 of features, e.g. set of best spheres from SearchLight, set of
941 ROIs selected elsewhere. It would be different from simply
942 applying whole mask over the dataset, since here initial decision
943 is made by each classifier and then later on they vote for the
944 final decision across the set of classifiers.
945 """
946
947 - def __init__(self, clf, mapper, **kwargs):
948 """Initialize the instance
949
950 :Parameters:
951 clf : Classifier
952 classifier based on which mask classifiers is created
953 mapper
954 whatever `Mapper` comes handy
955 """
956 ProxyClassifier.__init__(self, clf, **kwargs)
957
958 self.__mapper = mapper
959 """mapper to help us our with prepping data to
960 training/classification"""
961
962
964 """Train `MappedClassifier`
965 """
966
967
968
969 self.__mapper.train(dataset)
970
971
972 wdataset = dataset.applyMapper(featuresmapper = self.__mapper)
973 ProxyClassifier._train(self, wdataset)
974
975
980
981
982 @group_kwargs(prefixes=['slave_'], passthrough=True)
989
990
991 mapper = property(lambda x:x.__mapper, doc="Used mapper")
992
993
994
996 """`ProxyClassifier` which uses some `FeatureSelection` prior training.
997
998 `FeatureSelection` is used first to select features for the classifier to
999 use for prediction. Internally it would rely on MappedClassifier which
1000 would use created MaskMapper.
1001
1002 TODO: think about removing overhead of retraining the same classifier if
1003 feature selection was carried out with the same classifier already. It
1004 has been addressed by adding .trained property to classifier, but now
1005 we should explicitly use isTrained here if we want... need to think more
1006 """
1007
1008 _clf_internals = [ 'does_feature_selection', 'meta' ]
1009
1010 - def __init__(self, clf, feature_selection, testdataset=None, **kwargs):
1011 """Initialize the instance
1012
1013 :Parameters:
1014 clf : Classifier
1015 classifier based on which mask classifiers is created
1016 feature_selection : FeatureSelection
1017 whatever `FeatureSelection` comes handy
1018 testdataset : Dataset
1019 optional dataset which would be given on call to feature_selection
1020 """
1021 ProxyClassifier.__init__(self, clf, **kwargs)
1022
1023 self.__maskclf = None
1024 """Should become `MappedClassifier`(mapper=`MaskMapper`) later on."""
1025
1026 self.__feature_selection = feature_selection
1027 """`FeatureSelection` to select the features prior training"""
1028
1029 self.__testdataset = testdataset
1030 """`FeatureSelection` might like to use testdataset"""
1031
1032
1034 """Untrain `FeatureSelectionClassifier`
1035
1036 Has to untrain any known classifier
1037 """
1038 if not self.trained:
1039 return
1040 if not self.__maskclf is None:
1041 self.__maskclf.untrain()
1042 super(FeatureSelectionClassifier, self).untrain()
1043
1044
1046 """Train `FeatureSelectionClassifier`
1047 """
1048
1049 self.__feature_selection.states._changeTemporarily(
1050 enable_states=["selected_ids"])
1051
1052 if __debug__:
1053 debug("CLFFS", "Performing feature selection using %s" %
1054 self.__feature_selection + " on %s" % dataset)
1055
1056 (wdataset, tdataset) = self.__feature_selection(dataset,
1057 self.__testdataset)
1058 if __debug__:
1059 add_ = ""
1060 if "CLFFS_" in debug.active:
1061 add_ = " Selected features: %s" % \
1062 self.__feature_selection.selected_ids
1063 debug("CLFFS", "%(fs)s selected %(nfeat)d out of " +
1064 "%(dsnfeat)d features.%(app)s",
1065 msgargs={'fs':self.__feature_selection,
1066 'nfeat':wdataset.nfeatures,
1067 'dsnfeat':dataset.nfeatures,
1068 'app':add_})
1069
1070
1071
1072 mappermask = N.zeros(dataset.nfeatures)
1073 mappermask[self.__feature_selection.selected_ids] = 1
1074 mapper = MaskMapper(mappermask)
1075
1076 self.__feature_selection.states._resetEnabledTemporarily()
1077
1078
1079 self.__maskclf = MappedClassifier(self.clf, mapper)
1080
1081
1082 self.__maskclf.clf.train(wdataset)
1083
1084
1085
1086
1087
1089 """Return used feature ids for `FeatureSelectionClassifier`
1090
1091 """
1092 return self.__feature_selection.selected_ids
1093
1095 """Predict using `FeatureSelectionClassifier`
1096 """
1097 clf = self.__maskclf
1098 if self.states.isEnabled('values'):
1099 clf.states.enable(['values'])
1100
1101 result = clf._predict(data)
1102
1103 self.states._copy_states_(clf, ['values'], deep=False)
1104 return result
1105
1107 """Set testing dataset to be used for feature selection
1108 """
1109 self.__testdataset = testdataset
1110
1111 maskclf = property(lambda x:x.__maskclf, doc="Used `MappedClassifier`")
1112 feature_selection = property(lambda x:x.__feature_selection,
1113 doc="Used `FeatureSelection`")
1114
1115 @group_kwargs(prefixes=['slave_'], passthrough=True)
1125
1126
1127
1128 testdataset = property(fget=lambda x:x.__testdataset,
1129 fset=setTestDataset)
1130