
Source Code for Module mvpa.clfs.sg.svm

#emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Wrap the Shogun (SG) machine learning toolbox into a very simple class interface."""
__docformat__ = 'restructuredtext'


_DEV__doc__ = """

TODOs:
 * dual-license under GPL for use of SG?
 * for recent versions add ability to specify/parametrize normalization
   scheme for the kernel, and reuse 'scale' now for the normalizer
 * Add support for simplified linear classifiers (which do not require
   storing all training SVs/samples to make classification in predict())
"""

import numpy as N


# Rely on SG
import shogun.Features
import shogun.Classifier
import shogun.Regression
import shogun.Kernel
import shogun.Library

import operator

from mvpa.misc.param import Parameter
from mvpa.base import warning

from mvpa.clfs.meta import MulticlassClassifier
from mvpa.clfs._svmbase import _SVM
from mvpa.misc.state import StateVariable
from mvpa.measures.base import Sensitivity
from mvpa.base import externals

from sens import *

if __debug__:
    from mvpa.base import debug

def _setdebug(obj, partname):
    """Helper to set level of debugging output for SG

    :Parameters:
      obj
        In SG debug output seems to be set per every object
      partname : basestring
        For what kind of object we are talking about... could be automated
        later on (TODO)
    """
    debugname = "SG_%s" % partname.upper()

    switch = {True: (shogun.Kernel.M_DEBUG, 'M_DEBUG', "enable"),
              False: (shogun.Kernel.M_ERROR, 'M_ERROR', "disable")}

    key = __debug__ and debugname in debug.active

    sglevel, slevel, progressfunc = switch[key]

    if __debug__:
        debug("SG_", "Setting verbosity for shogun.%s instance: %s to %s" %
              (partname, `obj`, slevel))
    obj.io.set_loglevel(sglevel)
    try:
        exec "obj.io.%s_progress()" % progressfunc
    except:
        warning("Shogun version installed has no way to enable progress" +
                " reports")
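
# Illustrative sketch (hypothetical, not part of the module): with the
# corresponding debug target active, _setdebug() switches a shogun object to
# verbose output; otherwise it silences it down to errors only.
#
#   >>> feats = shogun.Features.RealFeatures(N.random.rand(3, 5))
#   >>> _setdebug(feats, 'Features')  # M_DEBUG if 'SG_FEATURES' in debug.active,
#   ...                               # M_ERROR (no progress reports) otherwise
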

def _tosg(data):
    """Draft helper function to convert data we have into SG suitable format

    TODO: Support different datatypes
    """

    if __debug__:
        debug("SG_", "Converting data for shogun into RealFeatures")

    features = shogun.Features.RealFeatures(data.astype('double').T)

    if __debug__:
        debug("SG__", "Done converting data for shogun into RealFeatures")
    _setdebug(features, 'Features')
    return features
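
# Illustrative sketch (hypothetical, not part of the module): PyMVPA stores
# samples as rows, while shogun's RealFeatures expects one column per feature
# vector, hence the transpose inside _tosg().
#
#   >>> samples = N.array([[1, 2, 3],
#   ...                    [4, 5, 6]])   # 2 samples x 3 features
#   >>> feats = _tosg(samples)           # RealFeatures holding 2 vectors
#   >>> feats.get_num_vectors(), feats.get_num_features()
#   (2, 3)
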

class SVM(_SVM):
    """Support Vector Machine Classifier(s) based on Shogun

    This is a simple base interface
    """

    num_threads = Parameter(1,
                            min=1,
                            doc='Number of threads to utilize')

    # NOTE: gamma is width in SG notation for RBF(Gaussian)
    _KERNELS = { "linear":   (shogun.Kernel.LinearKernel, ('scale',), LinearSVMWeights),
                 "rbf":      (shogun.Kernel.GaussianKernel, ('gamma',), None),
                 "rbfshift": (shogun.Kernel.GaussianShiftKernel,
                              ('gamma', 'max_shift', 'shift_step'), None),
                 "sigmoid":  (shogun.Kernel.SigmoidKernel,
                              ('cache_size', 'gamma', 'coef0'), None),
                 }

    _KNOWN_PARAMS = [ 'epsilon' ]
    _KNOWN_KERNEL_PARAMS = [ ]

    _clf_internals = _SVM._clf_internals + [ 'sg', 'retrainable' ]

    if externals.exists('sg >= 0.6.4'):
        _KERNELS['linear'] = (shogun.Kernel.LinearKernel, (), LinearSVMWeights)

    # Some words of wisdom from shogun author:
    # XXX remove after proper comments added to implementations
    """
    If you'd like to train linear SVMs use SGD or OCAS. These are (I am
    serious) the fastest linear SVM-solvers to date. (OCAS cannot do SVMs
    with standard additive bias, but will L2 regularize it - though it
    should not matter much in practice (although it will give slightly
    different solutions)). Note that SGD has no stopping criterion (you
    simply have to specify the number of iterations) and that OCAS has a
    different stopping condition than svmlight for example, which may be
    tighter or looser depending on the problem - I suggest 1e-2 or 1e-3
    for epsilon.

    If you would like to train kernel SVMs use libsvm/gpdt/svmlight -
    depending on the problem one is faster than the other (hard to say when;
    I *think* when your dataset is very unbalanced chunking methods like
    svmlight/gpdt are better), for smaller problems definitely libsvm.

    If you use string kernels then gpdt/svmlight have a special 'linadd'
    speedup for this (requires sg 0.6.2 - there was some inefficiency in the
    code for python-modular before that). This is effective for big datasets
    (I trained on 10 million strings based on this).

    And yes, currently we only implemented parallel training for svmlight,
    however all SVMs can be evaluated in parallel.
    """
    _KNOWN_IMPLEMENTATIONS = {
        "libsvm" : (shogun.Classifier.LibSVM, ('C',), ('multiclass', 'binary'),
                    "LIBSVM's C-SVM (L2 soft-margin SVM)"),
        "gmnp"   : (shogun.Classifier.GMNPSVM, ('C',), ('multiclass', 'binary'),
                    "Generalized Nearest Point Problem SVM"),
        # XXX should have been GPDT, shogun has it fixed since some version
        "gpbt"   : (shogun.Classifier.GPBTSVM, ('C',), ('binary',),
                    "Gradient Projection Decomposition Technique for large-scale SVM problems"),
        "gnpp"   : (shogun.Classifier.GNPPSVM, ('C',), ('binary',),
                    "Generalized Nearest Point Problem SVM"),

        ## TODO: Needs sparse features...
        # "svmlin" : (shogun.Classifier.SVMLin, ''),
        # "liblinear" : (shogun.Classifier.LibLinear, ''),
        # "subgradient" : (shogun.Classifier.SubGradientSVM, ''),
        ## good 2-class linear SVMs
        # "ocas" : (shogun.Classifier.SVMOcas, ''),
        # "sgd" : ( shogun.Classifier.SVMSGD, ''),

        # regressions
        "libsvr" : (shogun.Regression.LibSVR, ('C', 'tube_epsilon',), ('regression',),
                    "LIBSVM's epsilon-SVR"),
        "krr"    : (shogun.Regression.KRR, ('tau',), ('regression',),
                    "Kernel Ridge Regression"),
        }

    def __init__(self,
                 kernel_type='linear',
                 **kwargs):
        """Interface class to Shogun's classifiers and regressions.

        Default implementation is 'libsvm'.
        """

        svm_impl = kwargs.get('svm_impl', 'libsvm').lower()
        kwargs['svm_impl'] = svm_impl

        # init base class
        _SVM.__init__(self, kernel_type=kernel_type, **kwargs)

        self.__svm = None
        """Holds the trained svm."""

        # Need to store original data...
        # TODO: keep 1 of them -- just __traindata or __traindataset
        # For now it is needed for computing sensitivities
        self.__traindataset = None

        # internal SG swig proxies
        self.__traindata = None
        self.__kernel = None
        self.__kernel_test = None
        self.__testdata = None
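
    # Illustrative sketch (hypothetical, not part of the module): kernel and
    # implementation are selected by name at construction time, e.g.
    #
    #   >>> clf = SVM(kernel_type='rbf', svm_impl='libsvm', C=1.0)
    #   >>> reg = SVM(kernel_type='linear', svm_impl='libsvr',
    #   ...           tube_epsilon=0.1)    # epsilon-SVR for regression
    #
    # A negative C (e.g. C=-1.0) is rescaled by a data-derived default
    # inside _train() below.
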

    def __condition_kernel(self, kernel):
        # XXX I thought that it is needed only for retrainable classifier,
        # but then krr gets confused, and svrlight needs it to provide
        # meaningful results even without 'retraining'
        if self._svm_impl in ['svrlight', 'lightsvm']:
            kernel.set_precompute_matrix(True, True)

    def _train(self, dataset):
        """Train SVM
        """
        # XXX watchout
        # self.untrain()
        newkernel, newsvm = False, False
        # local bindings for faster lookup
        retrainable = self.params.retrainable

        if retrainable:
            _changedData = self._changedData

        # LABELS
        ul = None
        self.__traindataset = dataset


        # OK -- we have to map labels since
        #   binary ones expect -1/+1
        #   Multiclass expect labels starting with 0, otherwise they puke
        #   when run from ipython... yikes
        if __debug__:
            debug("SG_", "Creating labels instance")

        if 'regression' in self._clf_internals:
            labels_ = N.asarray(dataset.labels, dtype='double')
        else:
            ul = dataset.uniquelabels
            ul.sort()

            if len(ul) == 2:
                # assure that we have -1/+1
                _labels_dict = {ul[0]:-1.0, ul[1]:+1.0}
            elif len(ul) < 2:
                raise ValueError, "we do not have 1-class SVM brought into SG yet"
            else:
                # can't use plain enumerate since we need them swapped
                _labels_dict = dict([ (ul[i], i) for i in range(len(ul))])

            # reverse labels dict for back mapping in _predict
            _labels_dict_rev = dict([(x[1], x[0])
                                     for x in _labels_dict.items()])

            # bind to instance as well
            self._labels_dict = _labels_dict
            self._labels_dict_rev = _labels_dict_rev

            # Map labels
            #
            # TODO: top level classifier should take care about labels
            # mapping if that is needed
            if __debug__:
                debug("SG__", "Mapping labels using dict %s" % _labels_dict)
            labels_ = N.asarray([ _labels_dict[x] for x in dataset.labels ],
                                dtype='double')

        labels = shogun.Features.Labels(labels_)
        _setdebug(labels, 'Labels')


        # KERNEL
        if not retrainable or _changedData['traindata'] \
               or _changedData['kernel_params']:
            # If needed compute or just collect arguments for SVM and for
            # the kernel
            kargs = []
            for arg in self._KERNELS[self._kernel_type_literal][1]:
                value = self.kernel_params[arg].value
                # XXX Unify damn automagic gamma value
                if arg == 'gamma' and value == 0.0:
                    value = self._getDefaultGamma(dataset)
                kargs += [value]

            if retrainable and __debug__:
                if _changedData['traindata']:
                    debug("SG",
                          "Re-Creating kernel since training data has changed")

                if _changedData['kernel_params']:
                    debug("SG",
                          "Re-Creating kernel since params %s have changed" %
                          _changedData['kernel_params'])

            # create training data
            if __debug__: debug("SG_", "Converting input data for shogun")
            self.__traindata = _tosg(dataset.samples)

            if __debug__:
                debug("SG", "Creating kernel instance of %s giving arguments %s" %
                      (`self._kernel_type`, kargs))

            self.__kernel = kernel = \
                            self._kernel_type(self.__traindata, self.__traindata,
                                              *kargs)

            if externals.exists('sg >= 0.6.4'):
                kernel.set_normalizer(shogun.Kernel.IdentityKernelNormalizer())

            newkernel = True
            self.kernel_params.reset()  # mark them as not-changed
            _setdebug(kernel, 'Kernels')

            self.__condition_kernel(kernel)
            if retrainable:
                if __debug__:
                    debug("SG_", "Resetting test kernel for retrainable SVM")
                self.__kernel_test = None
                self.__kernel_args = kargs

        # TODO -- handle _changedData['params'] correctly, ie without recreating
        # whole SVM
        Cs = None
        if not retrainable or self.__svm is None or _changedData['params']:
            # SVM
            if self.params.isKnown('C'):
                C = self.params.C
                if not operator.isSequenceType(C):
                    # we were not given a tuple for balancing between classes
                    C = [C]

                Cs = list(C[:])             # copy
                for i in xrange(len(Cs)):
                    if Cs[i] < 0:
                        Cs[i] = self._getDefaultC(dataset.samples)*abs(Cs[i])
                        if __debug__:
                            debug("SG_", "Default C for %s was computed to be %s" %
                                  (C[i], Cs[i]))

            # XXX do not jump over the head and leave it up to the user
            #     ie do not rescale automagically by the number of samples
            #if len(Cs) == 2 and not ('regression' in self._clf_internals) and len(ul) == 2:
            #    # we were given two Cs
            #    if N.max(C) < 0 and N.min(C) < 0:
            #        # and both are requested to be 'scaled' TODO :
            #        # provide proper 'features' to the parameters,
            #        # so we could specify explicitly if to scale
            #        # them by the number of samples here
            #        nl = [N.sum(labels_ == _labels_dict[l]) for l in ul]
            #        ratio = N.sqrt(float(nl[1]) / nl[0])
            #        #ratio = (float(nl[1]) / nl[0])
            #        Cs[0] *= ratio
            #        Cs[1] /= ratio
            #        if __debug__:
            #            debug("SG_", "Rescaled Cs to %s to accommodate the "
            #                  "difference in number of training samples" %
            #                  Cs)

            # Choose appropriate implementation
            svm_impl_class = self.__get_implementation(ul)

            if __debug__:
                debug("SG", "Creating SVM instance of %s" % `svm_impl_class`)

            if self._svm_impl in ['libsvr', 'svrlight']:
                # for regressions the constructor is a bit different
                self.__svm = svm_impl_class(Cs[0], self.params.epsilon,
                                            self.__kernel, labels)
            elif self._svm_impl in ['krr']:
                self.__svm = svm_impl_class(self.params.tau, self.__kernel, labels)
            else:
                self.__svm = svm_impl_class(Cs[0], self.__kernel, labels)
                self.__svm.set_epsilon(self.params.epsilon)
            if Cs is not None and len(Cs) == 2:
                if __debug__:
                    debug("SG_", "Since multiple Cs are provided: %s, assign them" % Cs)
                self.__svm.set_C(Cs[0], Cs[1])

            self.params.reset()  # mark them as not-changed
            newsvm = True
            _setdebug(self.__svm, 'SVM')
            # Set optimization parameters
            if self.params.isKnown('tube_epsilon') and \
                   hasattr(self.__svm, 'set_tube_epsilon'):
                self.__svm.set_tube_epsilon(self.params.tube_epsilon)
            self.__svm.parallel.set_num_threads(self.params.num_threads)
        else:
            if __debug__:
                debug("SG_", "SVM instance is not re-created")
            if _changedData['labels']:          # labels were changed
                if __debug__: debug("SG__", "Assigning new labels")
                self.__svm.set_labels(labels)
            if newkernel:                       # kernel was replaced
                if __debug__: debug("SG__", "Assigning new kernel")
                self.__svm.set_kernel(self.__kernel)
            assert(_changedData['params'] is False)  # we should never get here

        if retrainable:
            # we must assign it only if it is retrainable
            self.states.retrained = not newsvm or not newkernel

        # Train
        if __debug__ and 'SG' in debug.active:
            if not self.regression:
                lstr = " with labels %s" % dataset.uniquelabels
            else:
                lstr = ""
            debug("SG", "%sTraining %s on data%s" %
                  (("", "Re-")[retrainable and self.states.retrained],
                   self, lstr))

        self.__svm.train()

        if __debug__:
            debug("SG_", "Done training SG_SVM %s" % self._kernel_type)

        # Report on training
        if (__debug__ and 'SG__' in debug.active) or \
           self.states.isEnabled('training_confusion'):
            trained_labels = self.__svm.classify().get_labels()
        else:
            trained_labels = None

        if __debug__ and "SG__" in debug.active:
            debug("SG__", "Original labels: %s, Trained labels: %s" %
                  (dataset.labels, trained_labels))

        # Assign training confusion right away here since we are ready
        # to do so.
        # XXX TODO use some other state variable like 'trained_labels' and
        #     use it within base Classifier._posttrain to assign predictions
        #     instead of duplicating code here
        # XXX For now it can be done only for regressions since labels need to
        #     be remapped and that becomes even worse if we use regression
        #     as a classifier so mapping happens upstairs
        if self.regression and self.states.isEnabled('training_confusion'):
            self.states.training_confusion = self._summaryClass(
                targets=dataset.labels,
                predictions=trained_labels)
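
    # Illustrative sketch (hypothetical, not part of the module) of the label
    # mapping performed in _train() above: binary problems are coded as -1/+1,
    # multiclass ones as 0..N-1, and _labels_dict_rev undoes the mapping in
    # _predict():
    #
    #   >>> ul = ['face', 'house']                 # sorted unique labels
    #   >>> _labels_dict = {ul[0]: -1.0, ul[1]: +1.0}
    #   >>> [_labels_dict[x] for x in ['house', 'face', 'face']]
    #   [1.0, -1.0, -1.0]
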

    def _predict(self, data):
        """Predict values for the data
        """

        retrainable = self.params.retrainable

        if retrainable:
            changed_testdata = self._changedData['testdata'] or \
                               self.__kernel_test is None

        if not retrainable or changed_testdata:
            testdata = _tosg(data)

        if not retrainable:
            if __debug__:
                debug("SG__",
                      "Initializing SVMs kernel of %s with training/testing samples"
                      % self)
            # We can just reuse kernel used for training
            self.__kernel.init(self.__traindata, testdata)
            self.__condition_kernel(self.__kernel)
        else:
            if changed_testdata:
                if __debug__:
                    debug("SG__",
                          "Re-creating testing kernel of %s giving "
                          "arguments %s" %
                          (`self._kernel_type`, self.__kernel_args))
                kernel_test = self._kernel_type(self.__traindata, testdata,
                                                *self.__kernel_args)
                _setdebug(kernel_test, 'Kernels')

                custk_args = ([self.__traindata, testdata], [])[
                    int(externals.exists('sg >= 0.6.4'))]
                if __debug__:
                    debug("SG__",
                          "Re-creating custom testing kernel giving "
                          "arguments %s" % (str(custk_args)))
                kernel_test_custom = shogun.Kernel.CustomKernel(*custk_args)

                _setdebug(kernel_test_custom, 'Kernels')
                self.__kernel_test = kernel_test_custom
                self.__kernel_test.set_full_kernel_matrix_from_full(
                    kernel_test.get_kernel_matrix())
            elif __debug__:
                debug("SG__", "Re-using testing kernel")

            assert(self.__kernel_test is not None)
            self.__svm.set_kernel(self.__kernel_test)

        if __debug__:
            debug("SG_", "Classifying testing data")

        # doesn't do any good imho although on unittests helps tiny bit... hm
        #self.__svm.init_kernel_optimization()
        values_ = self.__svm.classify()
        if values_ is None:
            raise RuntimeError, "We got empty list of values from %s" % self

        values = values_.get_labels()

        if retrainable:
            # we must assign it only if it is retrainable
            self.states.repredicted = repredicted = not changed_testdata
            if __debug__:
                debug("SG__", "Re-assigning learning kernel. Repredicted is %s"
                      % repredicted)
            # return back original kernel
            self.__svm.set_kernel(self.__kernel)

        if __debug__:
            debug("SG__", "Got values %s" % values)

        if ('regression' in self._clf_internals):
            predictions = values
        else:
            # local bindings
            _labels_dict = self._labels_dict
            _labels_dict_rev = self._labels_dict_rev

            if len(_labels_dict) == 2:
                predictions = 1.0 - 2*N.signbit(values)
            else:
                predictions = values

            # assure that we have the same type
            label_type = type(_labels_dict.values()[0])

            # remap labels back adjusting their type
            predictions = [_labels_dict_rev[label_type(x)]
                           for x in predictions]

            if __debug__:
                debug("SG__", "Tuned predictions %s" % predictions)

        # store state variable
        # TODO: extract values properly for multiclass SVMs --
        #       ie 1 value per label or pairs for all 1-vs-1 classifications
        self.values = values

        ## to avoid leaks with not yet properly fixed shogun
        if not retrainable:
            try:
                testdata.free_features()
            except:
                pass

        return predictions
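
    # Illustrative sketch (hypothetical, not part of the module) of the
    # binary thresholding above: N.signbit(v) is True for negative v, so
    # 1.0 - 2*N.signbit(v) folds raw decision values into -1/+1 codes:
    #
    #   >>> values = N.array([-0.7, 0.2, -0.1, 1.5])
    #   >>> 1.0 - 2*N.signbit(values)
    #   array([-1.,  1., -1.,  1.])
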

    def untrain(self):
        super(SVM, self).untrain()
        if not self.params.retrainable:
            if __debug__:
                debug("SG__", "Untraining %(clf)s and destroying sg's SVM",
                      msgargs={'clf':self})

            # to avoid leaks with not yet properly fixed shogun
            # XXX make it nice... now it is just stable ;-)
            if True:                    # not self.__traindata is None:
                if True:
                # try:
                    if self.__kernel is not None:
                        del self.__kernel
                        self.__kernel = None

                    if self.__kernel_test is not None:
                        del self.__kernel_test
                        self.__kernel_test = None

                    if self.__svm is not None:
                        del self.__svm
                        self.__svm = None

                    if self.__traindata is not None:
                        # Let in for easy demonstration of the memory leak in shogun
                        #for i in xrange(10):
                        #    debug("SG__", "cachesize pre free features %s" %
                        #          (self.__svm.get_kernel().get_cache_size()))
                        self.__traindata.free_features()
                        del self.__traindata
                        self.__traindata = None

                    self.__traindataset = None


                #except:
                #    pass

            if __debug__:
                debug("SG__",
                      "Done untraining %(self)s and destroying sg's SVM",
                      msgargs=locals())
        elif __debug__:
            debug("SG__", "Not untraining %(self)s since it is retrainable",
                  msgargs=locals())

    def __get_implementation(self, ul):
        if 'regression' in self._clf_internals or len(ul) == 2:
            svm_impl_class = SVM._KNOWN_IMPLEMENTATIONS[self._svm_impl][0]
        else:
            if self._svm_impl == 'libsvm':
                svm_impl_class = shogun.Classifier.LibSVMMultiClass
            elif self._svm_impl == 'gmnp':
                svm_impl_class = shogun.Classifier.GMNPSVM
            else:
                raise RuntimeError, \
                      "Shogun: Implementation %s doesn't handle multiclass " \
                      "data. Got labels %s. Use some other classifier" % \
                      (self._svm_impl, self.__traindataset.uniquelabels)
            if __debug__:
                debug("SG_", "Using %s for multiclass data of %s" %
                      (svm_impl_class, self._svm_impl))

        return svm_impl_class


    svm = property(fget=lambda self: self.__svm)
    """Access to the SVM model."""

    traindataset = property(fget=lambda self: self.__traindataset)
    """Dataset which was used for training

    TODO -- might better become state variable I guess"""


# Conditionally make some of the implementations available if they are
# present in the present shogun
for name, item, params, descr in \
        [('mpd', "shogun.Classifier.MPDSVM", "('C',), ('binary',)",
          "MPD classifier from shogun"),
         ('lightsvm', "shogun.Classifier.SVMLight", "('C',), ('binary',)",
          "SVMLight classification http://svmlight.joachims.org/"),
         ('svrlight', "shogun.Regression.SVRLight", "('C','tube_epsilon',), ('regression',)",
          "SVMLight regression http://svmlight.joachims.org/")]:
    if externals.exists('shogun.%s' % name):
        exec "SVM._KNOWN_IMPLEMENTATIONS[\"%s\"] = (%s, %s, \"%s\")" % \
             (name, item, params, descr)

# Assign SVM class to limited set of LinearSVMWeights
LinearSVMWeights._LEGAL_CLFS = [SVM]
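
# Illustrative end-to-end sketch (hypothetical, not part of the module),
# assuming a PyMVPA dataset `ds` with binary labels and a matching test
# array `test_samples`:
#
#   >>> from mvpa.clfs.sg.svm import SVM
#   >>> clf = SVM(kernel_type='linear', svm_impl='libsvm', C=-1.0)
#   >>> clf.train(ds)                 # drives _train(): builds kernel + SVM
#   >>> predictions = clf.predict(test_samples)
#   >>> clf.untrain()                 # release shogun objects explicitly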