Package mvpa :: Package clfs :: Module warehouse
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.clfs.warehouse

  1  #emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  #ex: set sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Collection of classifiers to ease the exploration. 
 10  """ 
 11   
 12  __docformat__ = 'restructuredtext' 
 13   
 14  from sets import Set 
 15  import operator 
 16   
 17  # Define sets of classifiers 
 18  from mvpa.clfs.meta import FeatureSelectionClassifier, SplitClassifier, \ 
 19       MulticlassClassifier 
 20  from mvpa.clfs.smlr import SMLR 
 21  from mvpa.clfs.knn import kNN 
 22  from mvpa.clfs.kernel import KernelLinear, KernelSquaredExponential 
 23   
 24  # Helpers 
 25  from mvpa.base import externals, cfg 
 26  from mvpa.measures.anova import OneWayAnova 
 27  from mvpa.misc.transformers import Absolute 
 28  from mvpa.clfs.smlr import SMLRWeights 
 29  from mvpa.featsel.helpers import FractionTailSelector, \ 
 30      FixedNElementTailSelector, RangeElementSelector 
 31   
 32  from mvpa.featsel.base import SensitivityBasedFeatureSelection 
 33   
 34  _KNOWN_INTERNALS = [ 'knn', 'binary', 'svm', 'linear', 
 35          'smlr', 'does_feature_selection', 'has_sensitivity', 
 36          'multiclass', 'non-linear', 'kernel-based', 'lars', 
 37          'regression', 'libsvm', 'sg', 'meta', 'retrainable', 'gpr', 
 38          'notrain2predict', 'ridge', 'blr', 'gnpp'] 
 39   
class Warehouse(object):
    """Class to keep known instantiated classifiers

    Should provide easy ways to select classifiers of needed kind:
    clfswh['linear', 'svm'] should return all linear SVMs
    clfswh['linear', 'multiclass'] should return all linear classifiers
    capable of doing multiclass classification
    """

    def __init__(self, known_tags=None, matches=None):
        """Initialize warehouse

        :Parameters:
          known_tags : list of basestring
            List of known tags
          matches : dict
            Optional dictionary of additional matches. E.g. since any
            regression can be used as a binary classifier,
            matches={'binary':['regression']}, would allow to provide
            regressions also if 'binary' was requested
        """
        # Tags considered legal for registration/lookup; extended externally
        # at module level (e.g. with backend-specific implementation names)
        self._known_tags = Set(known_tags)
        # Registered classifier instances, in registration order
        self.__items = []
        # Union of the _clf_internals of everything registered so far
        self.__keys = Set()
        if matches is None:
            matches = {}
        # tag -> list of additional tags that also satisfy a query for 'tag'
        self.__matches = matches

    def __getitem__(self, *args):
        """Select registered items matching all given tags.

        Accepts multiple tags (``wh['linear', 'svm']``); a tag prefixed
        with '!' rejects items carrying it.  ``wh[:]`` returns everything.
        Raises ValueError for tags not in the known set.
        """
        # Subscript with several keys arrives as a single tuple argument
        if isinstance(args[0], tuple):
            args = args[0]

        # so we explicitely handle [:]
        if args == (slice(None),):
            args = []

        # lets remove optional modifier '!'
        dargs = Set([str(x).lstrip('!') for x in args]).difference(
            self._known_tags)

        if len(dargs)>0:
            raise ValueError, "Unknown internals %s requested. Known are %s" % \
                  (list(dargs), list(self._known_tags))

        # dummy implementation for now
        result = []
        # check every known item
        for item in self.__items:
            good = True
            # by default each one counts
            for arg in args:
                # check for rejection first
                if arg.startswith('!'):
                    if (arg[1:] in item._clf_internals):
                        good = False
                        break
                    else:
                        continue
                # check for inclusion: the requested tag itself, or any of
                # its configured substitutes from self.__matches
                # NOTE: inner loop deliberately rebinds 'arg'; the outer loop
                # re-reads its own iterator, so this shadowing is safe here
                found = False
                for arg in [arg] + self.__matches.get(arg, []):
                    if (arg in item._clf_internals):
                        found = True
                        break
                good = found
                if not good:
                    break
            if good:
                result.append(item)
        return result

    def __iadd__(self, item):
        """Register a classifier (or a sequence of them) via ``wh += item``.

        Every item must carry a non-empty ``_clf_internals`` whose tags are
        all known to this warehouse; otherwise ValueError is raised.
        """
        if operator.isSequenceType(item):
            # recurse so a list/tuple of items registers each element
            for item_ in item:
                self.__iadd__(item_)
        else:
            if not hasattr(item, '_clf_internals'):
                raise ValueError, "Cannot register %s " % item + \
                      "which has no _clf_internals defined"
            if len(item._clf_internals) == 0:
                raise ValueError, "Cannot register %s " % item + \
                      "which has empty _clf_internals"
            clf_internals = Set(item._clf_internals)
            if clf_internals.issubset(self._known_tags):
                self.__items.append(item)
                self.__keys |= clf_internals
            else:
                raise ValueError, 'Unknown clf internal(s) %s' % \
                      clf_internals.difference(self._known_tags)
        return self

    @property
    def internals(self):
        """Known internal tags of the classifiers
        """
        return self.__keys

    def listing(self):
        """Listing (description + internals) of registered items
        """
        return [(x.descr, x._clf_internals) for x in self.__items]

    @property
    def items(self):
        """Registered items
        """
        return self.__items
147 148 clfswh = Warehouse(known_tags=_KNOWN_INTERNALS) # classifiers 149 regrswh = Warehouse(known_tags=_KNOWN_INTERNALS) # regressions 150 151 # NB: 152 # - Nu-classifiers are turned off since for haxby DS default nu 153 # is an 'infisible' one 154 # - Python's SMLR is turned off for the duration of development 155 # since it is slow and results should be the same as of C version 156 # 157 clfswh += [ SMLR(lm=0.1, implementation="C", descr="SMLR(lm=0.1)"), 158 SMLR(lm=1.0, implementation="C", descr="SMLR(lm=1.0)"), 159 #SMLR(lm=10.0, implementation="C", descr="SMLR(lm=10.0)"), 160 #SMLR(lm=100.0, implementation="C", descr="SMLR(lm=100.0)"), 161 #SMLR(implementation="Python", descr="SMLR(Python)") 162 ] 163 164 clfswh += \ 165 [ MulticlassClassifier(clfswh['smlr'][0], 166 descr='Pairs+maxvote multiclass on ' + \ 167 clfswh['smlr'][0].descr) ] 168 169 if externals.exists('libsvm'): 170 from mvpa.clfs import libsvmc as libsvm 171 clfswh._known_tags.union_update(libsvm.SVM._KNOWN_IMPLEMENTATIONS.keys()) 172 clfswh += [libsvm.SVM(descr="libsvm.LinSVM(C=def)", probability=1), 173 libsvm.SVM( 174 C=-10.0, descr="libsvm.LinSVM(C=10*def)", probability=1), 175 libsvm.SVM( 176 C=1.0, descr="libsvm.LinSVM(C=1)", probability=1), 177 libsvm.SVM(svm_impl='NU_SVC', 178 descr="libsvm.LinNuSVM(nu=def)", probability=1) 179 ] 180 clfswh += [libsvm.SVM(kernel_type='RBF', descr="libsvm.RbfSVM()"), 181 libsvm.SVM(kernel_type='RBF', svm_impl='NU_SVC', 182 descr="libsvm.RbfNuSVM(nu=def)"), 183 libsvm.SVM(kernel_type='poly', 184 descr='libsvm.PolySVM()', probability=1), 185 #libsvm.svm.SVM(kernel_type='sigmoid', 186 # svm_impl='C_SVC', 187 # descr='libsvm.SigmoidSVM()'), 188 ] 189 190 # regressions 191 regrswh._known_tags.union_update(['EPSILON_SVR', 'NU_SVR']) 192 regrswh += [libsvm.SVM(svm_impl='EPSILON_SVR', descr='libsvm epsilon-SVR', 193 regression=True), 194 libsvm.SVM(svm_impl='NU_SVR', descr='libsvm nu-SVR', 195 regression=True)] 196 197 if externals.exists('shogun'): 198 from 
mvpa.clfs import sg 199 clfswh._known_tags.union_update(sg.SVM._KNOWN_IMPLEMENTATIONS) 200 201 # some classifiers are not yet ready to be used out-of-the-box in 202 # PyMVPA, thus we don't populate warehouse with their instances 203 bad_classifiers = [ 204 'mpd', # was segfault, now non-training on testcases, and XOR. 205 # and was described as "for educational purposes", thus 206 # shouldn't be used for real data ;-) 207 # Should be a drop-in replacement for lightsvm 208 'gpbt', # fails to train for testAnalyzerWithSplitClassifier 209 # also 'retraining' doesn't work -- fails to generalize 210 'gmnp', # would fail with 'assertion Cache_Size > 2' 211 # if shogun < 0.6.3, also refuses to train 212 'svrlight', # fails to 'generalize' as a binary classifier 213 # after 'binning' 214 'krr', # fails to generalize 215 ] 216 if not externals.exists('sg_fixedcachesize'): 217 # would fail with 'assertion Cache_Size > 2' if shogun < 0.6.3 218 bad_classifiers.append('gnpp') 219 220 for impl in sg.SVM._KNOWN_IMPLEMENTATIONS: 221 # Uncomment the ones to disable 222 if impl in bad_classifiers: 223 continue 224 clfswh += [ 225 sg.SVM( 226 descr="sg.LinSVM(C=def)/%s" % impl, svm_impl=impl), 227 sg.SVM( 228 C=-10.0, descr="sg.LinSVM(C=10*def)/%s" % impl, svm_impl=impl), 229 sg.SVM( 230 C=1.0, descr="sg.LinSVM(C=1)/%s" % impl, svm_impl=impl), 231 ] 232 clfswh += [ 233 sg.SVM(kernel_type='RBF', 234 descr="sg.RbfSVM()/%s" % impl, svm_impl=impl), 235 # sg.SVM(kernel_type='RBF', 236 # descr="sg.RbfSVM(gamma=0.1)/%s" 237 # % impl, svm_impl=impl, gamma=0.1), 238 # sg.SVM(descr="sg.SigmoidSVM()/%s" 239 # % impl, svm_impl=impl, kernel_type="sigmoid"), 240 ] 241 242 for impl in ['libsvr', 'krr']:# \ 243 # XXX svrlight sucks in SG -- dont' have time to figure it out 244 #+ ([], ['svrlight'])['svrlight' in sg.SVM._KNOWN_IMPLEMENTATIONS]: 245 regrswh._known_tags.union_update([impl]) 246 regrswh += [ sg.SVM(svm_impl=impl, descr='sg.LinSVMR()/%s' % impl, 247 regression=True), 248 
#sg.SVM(svm_impl=impl, kernel_type='RBF', 249 # descr='sg.RBFSVMR()/%s' % impl, 250 # regression=True), 251 ] 252 253 if len(clfswh['svm', 'linear']) > 0: 254 # if any SVM implementation is known, import default ones 255 from mvpa.clfs.svm import * 256 257 # lars from R via RPy 258 if externals.exists('lars'): 259 import mvpa.clfs.lars as lars 260 from mvpa.clfs.lars import LARS 261 for model in lars.known_models: 262 # XXX create proper repository of classifiers! 263 lars = LARS(descr="LARS(%s)" % model, model_type=model) 264 clfswh += lars 265 # clfswh += MulticlassClassifier(lars, 266 # descr='Multiclass %s' % lars.descr) 267 268 # kNN 269 clfswh += kNN(k=5, descr="kNN(k=5)") 270 271 clfswh += \ 272 FeatureSelectionClassifier( 273 kNN(), 274 SensitivityBasedFeatureSelection( 275 SMLRWeights(SMLR(lm=1.0, implementation="C")), 276 RangeElementSelector(mode='select')), 277 descr="kNN on SMLR(lm=1) non-0") 278 279 clfswh += \ 280 FeatureSelectionClassifier( 281 kNN(), 282 SensitivityBasedFeatureSelection( 283 OneWayAnova(), 284 FractionTailSelector(0.05, mode='select', tail='upper')), 285 descr="kNN on 5%(ANOVA)") 286 287 clfswh += \ 288 FeatureSelectionClassifier( 289 kNN(), 290 SensitivityBasedFeatureSelection( 291 OneWayAnova(), 292 FixedNElementTailSelector(50, mode='select', tail='upper')), 293 descr="kNN on 50(ANOVA)") 294 295 296 # GPR 297 if externals.exists('scipy'): 298 from mvpa.clfs.gpr import GPR 299 300 clfswh += GPR(kernel=KernelLinear(), descr="GPR(kernel='linear')") 301 clfswh += GPR(kernel=KernelSquaredExponential(), 302 descr="GPR(kernel='sqexp')") 303 304 # BLR 305 from mvpa.clfs.blr import BLR 306 clfswh += BLR(descr="BLR()") 307 308 309 # SVM stuff 310 311 if len(clfswh['linear', 'svm']) > 0: 312 313 linearSVMC = clfswh['linear', 'svm', 314 cfg.get('svm', 'backend', default='libsvm').lower() 315 ][0] 316 317 # "Interesting" classifiers 318 clfswh += \ 319 FeatureSelectionClassifier( 320 linearSVMC, 321 SensitivityBasedFeatureSelection( 322 
SMLRWeights(SMLR(lm=0.1, implementation="C")), 323 RangeElementSelector(mode='select')), 324 descr="LinSVM on SMLR(lm=0.1) non-0") 325 326 327 clfswh += \ 328 FeatureSelectionClassifier( 329 linearSVMC, 330 SensitivityBasedFeatureSelection( 331 SMLRWeights(SMLR(lm=1.0, implementation="C")), 332 RangeElementSelector(mode='select')), 333 descr="LinSVM on SMLR(lm=1) non-0") 334 335 336 # "Interesting" classifiers 337 clfswh += \ 338 FeatureSelectionClassifier( 339 RbfCSVMC(), 340 SensitivityBasedFeatureSelection( 341 SMLRWeights(SMLR(lm=1.0, implementation="C")), 342 RangeElementSelector(mode='select')), 343 descr="RbfSVM on SMLR(lm=1) non-0") 344 345 clfswh += \ 346 FeatureSelectionClassifier( 347 linearSVMC, 348 SensitivityBasedFeatureSelection( 349 OneWayAnova(), 350 FractionTailSelector(0.05, mode='select', tail='upper')), 351 descr="LinSVM on 5%(ANOVA)") 352 353 clfswh += \ 354 FeatureSelectionClassifier( 355 linearSVMC, 356 SensitivityBasedFeatureSelection( 357 OneWayAnova(), 358 FixedNElementTailSelector(50, mode='select', tail='upper')), 359 descr="LinSVM on 50(ANOVA)") 360 361 clfswh += \ 362 FeatureSelectionClassifier( 363 linearSVMC, 364 SensitivityBasedFeatureSelection( 365 linearSVMC.getSensitivityAnalyzer(transformer=Absolute), 366 FractionTailSelector(0.05, mode='select', tail='upper')), 367 descr="LinSVM on 5%(SVM)") 368 369 clfswh += \ 370 FeatureSelectionClassifier( 371 linearSVMC, 372 SensitivityBasedFeatureSelection( 373 linearSVMC.getSensitivityAnalyzer(transformer=Absolute), 374 FixedNElementTailSelector(50, mode='select', tail='upper')), 375 descr="LinSVM on 50(SVM)") 376 377 378 ### Imports which are specific to RFEs 379 # from mvpa.datasets.splitters import OddEvenSplitter 380 # from mvpa.clfs.transerror import TransferError 381 # from mvpa.featsel.rfe import RFE 382 # from mvpa.featsel.helpers import FixedErrorThresholdStopCrit 383 # from mvpa.clfs.transerror import ConfusionBasedError 384 385 # SVM with unbiased RFE -- transfer-error to 
another splits, or in 386 # other terms leave-1-out error on the same dataset 387 # Has to be bound outside of the RFE definition since both analyzer and 388 # error should use the same instance. 389 rfesvm_split = SplitClassifier(linearSVMC)#clfswh['LinearSVMC'][0]) 390 391 # "Almost" classical RFE. If this works it would differ only that 392 # our transfer_error is based on internal splitting and classifier used 393 # within RFE is a split classifier and its sensitivities per split will get 394 # averaged 395 # 396 397 #clfswh += \ 398 # FeatureSelectionClassifier( 399 # clf = LinearCSVMC(), #clfswh['LinearSVMC'][0], # we train LinearSVM 400 # feature_selection = RFE( # on features selected via RFE 401 # # based on sensitivity of a clf which does splitting internally 402 # sensitivity_analyzer=rfesvm_split.getSensitivityAnalyzer(), 403 # transfer_error=ConfusionBasedError( 404 # rfesvm_split, 405 # confusion_state="confusion"), 406 # # and whose internal error we use 407 # feature_selector=FractionTailSelector( 408 # 0.2, mode='discard', tail='lower'), 409 # # remove 20% of features at each step 410 # update_sensitivity=True), 411 # # update sensitivity at each step 412 # descr='LinSVM+RFE(splits_avg)' ) 413 # 414 #clfswh += \ 415 # FeatureSelectionClassifier( 416 # clf = LinearCSVMC(), # we train LinearSVM 417 # feature_selection = RFE( # on features selected via RFE 418 # # based on sensitivity of a clf which does splitting internally 419 # sensitivity_analyzer=rfesvm_split.getSensitivityAnalyzer(), 420 # transfer_error=ConfusionBasedError( 421 # rfesvm_split, 422 # confusion_state="confusion"), 423 # # and whose internal error we use 424 # feature_selector=FractionTailSelector( 425 # 0.2, mode='discard', tail='lower'), 426 # # remove 20% of features at each step 427 # update_sensitivity=False), 428 # # update sensitivity at each step 429 # descr='LinSVM+RFE(splits_avg,static)' ) 430 431 rfesvm = LinearCSVMC() 432 433 # This classifier will do RFE while taking 
transfer error to testing 434 # set of that split. Resultant classifier is voted classifier on top 435 # of all splits, let see what that would do ;-) 436 #clfswh += \ 437 # SplitClassifier( # which does splitting internally 438 # FeatureSelectionClassifier( 439 # clf = LinearCSVMC(), 440 # feature_selection = RFE( # on features selected via RFE 441 # sensitivity_analyzer=\ 442 # rfesvm.getSensitivityAnalyzer(transformer=Absolute), 443 # transfer_error=TransferError(rfesvm), 444 # stopping_criterion=FixedErrorThresholdStopCrit(0.05), 445 # feature_selector=FractionTailSelector( 446 # 0.2, mode='discard', tail='lower'), 447 # # remove 20% of features at each step 448 # update_sensitivity=True)), 449 # # update sensitivity at each step 450 # descr='LinSVM+RFE(N-Fold)') 451 # 452 # 453 #clfswh += \ 454 # SplitClassifier( # which does splitting internally 455 # FeatureSelectionClassifier( 456 # clf = LinearCSVMC(), 457 # feature_selection = RFE( # on features selected via RFE 458 # sensitivity_analyzer=\ 459 # rfesvm.getSensitivityAnalyzer(transformer=Absolute), 460 # transfer_error=TransferError(rfesvm), 461 # stopping_criterion=FixedErrorThresholdStopCrit(0.05), 462 # feature_selector=FractionTailSelector( 463 # 0.2, mode='discard', tail='lower'), 464 # # remove 20% of features at each step 465 # update_sensitivity=True)), 466 # # update sensitivity at each step 467 # splitter = OddEvenSplitter(), 468 # descr='LinSVM+RFE(OddEven)') 469