1
2
3
4
5
6
7
8
9 """Wrap the libsvm package into a very simple class interface."""
10
11 __docformat__ = 'restructuredtext'
12
13
14 _DEV__doc__ = """
15
16 TODOs:
17 * dual-license under GPL for use of SG?
18 * for recent versions add ability to specify/parametrize normalization
19 scheme for the kernel, and reuse 'scale' now for the normalizer
20 * Add support for simplified linear classifiers (which do not require
21 storing all training SVs/samples to make classification in predict())
22 """
23
24 import numpy as N
25
26
27
28 import shogun.Features
29 import shogun.Classifier
30 import shogun.Regression
31 import shogun.Kernel
32 import shogun.Library
33
34 import operator
35
36 from mvpa.misc.param import Parameter
37 from mvpa.base import warning
38
39 from mvpa.clfs.meta import MulticlassClassifier
40 from mvpa.clfs._svmbase import _SVM
41 from mvpa.misc.state import StateVariable
42 from mvpa.measures.base import Sensitivity
43 from mvpa.base import externals
44
45 from sens import *
46
47 if __debug__:
48 from mvpa.base import debug
49
50
51
52
def _setdebug(obj, partname):
    """Helper to set level of debugging output for SG

    :Parameters:
      obj
        In SG debug output seems to be set per every object
      partname : basestring
        For what kind of object we are talking about... could be automated
        later on (TODO)
    """
    debugname = "SG_%s" % partname.upper()

    # verbosity level, its literal name, and progress-report toggle to use
    switch = {True: (shogun.Kernel.M_DEBUG, 'M_DEBUG', "enable"),
              False: (shogun.Kernel.M_ERROR, 'M_ERROR', "disable")}

    key = __debug__ and debugname in debug.active

    sglevel, slevel, progressfunc = switch[key]

    if __debug__:
        debug("SG_", "Setting verbosity for shogun.%s instance: %s to %s" %
              (partname, repr(obj), slevel))
    obj.io.set_loglevel(sglevel)
    try:
        # resolve enable_progress/disable_progress dynamically -- some shogun
        # versions lack these methods entirely
        getattr(obj.io, "%s_progress" % progressfunc)()
    except Exception:
        # best effort only: missing method must not break classification
        warning("Shogun version installed has no way to enable progress" +
                " reports")
80
81
def _tosg(data):
    """Draft helper function to convert data we have into SG suitable format

    TODO: Support different datatypes

    :Parameters:
      data : 2D array (samples x features) convertible by numpy
    """
    if __debug__:
        debug("SG_", "Converting data for shogun into RealFeatures")

    # shogun's RealFeatures expects double precision, features x samples,
    # hence the cast and the transpose
    features = shogun.Features.RealFeatures(data.astype('double').T)

    if __debug__:
        debug("SG__", "Done converting data for shogun into RealFeatures")
    _setdebug(features, 'Features')
    return features
97
98
class SVM(_SVM):
    """Support Vector Machine Classifier(s) based on Shogun

    This is a simple base interface
    """

    num_threads = Parameter(1,
                            min=1,
                            doc='Number of threads to utilize')

    # kernel-type literal -> (shogun kernel class,
    #                         names of tunable parameters passed to it,
    #                         sensitivity analyzer class or None)
    _KERNELS = { "linear": (shogun.Kernel.LinearKernel, ('scale',), LinearSVMWeights),
                 "rbf" : (shogun.Kernel.GaussianKernel, ('gamma',), None),
                 "rbfshift" : (shogun.Kernel.GaussianShiftKernel, ('gamma', 'max_shift', 'shift_step'), None),
                 "sigmoid" : (shogun.Kernel.SigmoidKernel, ('cache_size', 'gamma', 'coef0'), None),
                 }

    _KNOWN_PARAMS = [ 'epsilon' ]
    _KNOWN_KERNEL_PARAMS = [ ]

    _clf_internals = _SVM._clf_internals + [ 'sg', 'retrainable' ]

    # NOTE(review): with sg >= 0.6.4 the linear kernel apparently takes no
    # 'scale' argument any longer -- an IdentityKernelNormalizer is assigned
    # at training time instead (see set_normalizer call in _train)
    if externals.exists('sg >= 0.6.4'):
        _KERNELS['linear'] = (shogun.Kernel.LinearKernel, (), LinearSVMWeights)

    # NOTE(review): implementation-selection advice, kept verbatim as a bare
    # (discarded) string for reference
    """
    If you'd like to train linear SVMs use SGD or OCAS. These are (I am
    serious) the fastest linear SVM-solvers to date. (OCAS cannot do SVMs
    with standard additive bias, but will L2 reqularize it - though it
    should not matter much in practice (although it will give slightly
    different solutions)). Note that SGD has no stopping criterion (you
    simply have to specify the number of iterations) and that OCAS has a
    different stopping condition than svmlight for example which may be more
    tight and more loose depending on the problem - I sugeest 1e-2 or 1e-3
    for epsilon.

    If you would like to train kernel SVMs use libsvm/gpdt/svmlight -
    depending on the problem one is faster than the other (hard to say when,
    I *think* when your dataset is very unbalanced chunking methods like
    svmlight/gpdt are better), for smaller problems definitely libsvm.

    If you use string kernels then gpdt/svmlight have a special 'linadd'
    speedup for this (requires sg 0.6.2 - there was some inefficiency in the
    code for python-modular before that). This is effective for big datasets
    and (I trained on 10 million strings based on this).

    And yes currently we only implemented parallel training for svmlight,
    however all SVMs can be evaluated in parallel.
    """
    # svm-impl literal -> (shogun class, margin params, capabilities, doc)
    _KNOWN_IMPLEMENTATIONS = {
        "libsvm" : (shogun.Classifier.LibSVM, ('C',), ('multiclass', 'binary'),
                    "LIBSVM's C-SVM (L2 soft-margin SVM)"),
        "gmnp" : (shogun.Classifier.GMNPSVM, ('C',), ('multiclass', 'binary'),
                  "Generalized Nearest Point Problem SVM"),
        "gpbt" : (shogun.Classifier.GPBTSVM, ('C',), ('binary',),
                  "Gradient Projection Decomposition Technique for large-scale SVM problems"),
        "gnpp" : (shogun.Classifier.GNPPSVM, ('C',), ('binary',),
                  "Generalized Nearest Point Problem SVM"),
        "libsvr": (shogun.Regression.LibSVR, ('C', 'tube_epsilon',), ('regression',),
                   "LIBSVM's epsilon-SVR"),
        "krr": (shogun.Regression.KRR, ('tau',), ('regression',),
                "Kernel Ridge Regression"),
        }
175
176
177 - def __init__(self,
178 kernel_type='linear',
179 **kwargs):
180 """Interface class to Shogun's classifiers and regressions.
181
182 Default implementation is 'libsvm'.
183 """
184
185 svm_impl = kwargs.get('svm_impl', 'libsvm').lower()
186 kwargs['svm_impl'] = svm_impl
187
188
189 _SVM.__init__(self, kernel_type=kernel_type, **kwargs)
190
191 self.__svm = None
192 """Holds the trained svm."""
193
194
195
196
197 self.__traindataset = None
198
199
200 self.__traindata = None
201 self.__kernel = None
202 self.__kernel_test = None
203 self.__testdata = None
204
205
207
208
209
210 if self._svm_impl in ['svrlight', 'lightsvm']:
211 kernel.set_precompute_matrix(True, True)
212
213
215 """Train SVM
216 """
217
218
219 newkernel, newsvm = False, False
220
221 retrainable = self.params.retrainable
222
223 if retrainable:
224 _changedData = self._changedData
225
226
227 ul = None
228 self.__traindataset = dataset
229
230
231
232
233
234
235 if __debug__:
236 debug("SG_", "Creating labels instance")
237
238 if 'regression' in self._clf_internals:
239 labels_ = N.asarray(dataset.labels, dtype='double')
240 else:
241 ul = dataset.uniquelabels
242 ul.sort()
243
244 if len(ul) == 2:
245
246 _labels_dict = {ul[0]:-1.0, ul[1]:+1.0}
247 elif len(ul) < 2:
248 raise ValueError, "we do not have 1-class SVM brought into SG yet"
249 else:
250
251 _labels_dict = dict([ (ul[i], i) for i in range(len(ul))])
252
253
254 _labels_dict_rev = dict([(x[1], x[0])
255 for x in _labels_dict.items()])
256
257
258 self._labels_dict = _labels_dict
259 self._labels_dict_rev = _labels_dict_rev
260
261
262
263
264
265 if __debug__:
266 debug("SG__", "Mapping labels using dict %s" % _labels_dict)
267 labels_ = N.asarray([ _labels_dict[x] for x in dataset.labels ], dtype='double')
268
269 labels = shogun.Features.Labels(labels_)
270 _setdebug(labels, 'Labels')
271
272
273
274 if not retrainable or _changedData['traindata'] or _changedData['kernel_params']:
275
276
277 kargs = []
278 for arg in self._KERNELS[self._kernel_type_literal][1]:
279 value = self.kernel_params[arg].value
280
281 if arg == 'gamma' and value == 0.0:
282 value = self._getDefaultGamma(dataset)
283 kargs += [value]
284
285 if retrainable and __debug__:
286 if _changedData['traindata']:
287 debug("SG",
288 "Re-Creating kernel since training data has changed")
289
290 if _changedData['kernel_params']:
291 debug("SG",
292 "Re-Creating kernel since params %s has changed" %
293 _changedData['kernel_params'])
294
295
296 if __debug__: debug("SG_", "Converting input data for shogun")
297 self.__traindata = _tosg(dataset.samples)
298
299 if __debug__:
300 debug("SG", "Creating kernel instance of %s giving arguments %s" %
301 (`self._kernel_type`, kargs))
302
303 self.__kernel = kernel = \
304 self._kernel_type(self.__traindata, self.__traindata,
305 *kargs)
306
307 if externals.exists('sg >= 0.6.4'):
308 kernel.set_normalizer(shogun.Kernel.IdentityKernelNormalizer())
309
310 newkernel = True
311 self.kernel_params.reset()
312 _setdebug(kernel, 'Kernels')
313
314 self.__condition_kernel(kernel)
315 if retrainable:
316 if __debug__:
317 debug("SG_", "Resetting test kernel for retrainable SVM")
318 self.__kernel_test = None
319 self.__kernel_args = kargs
320
321
322
323 Cs = None
324 if not retrainable or self.__svm is None or _changedData['params']:
325
326 if self.params.isKnown('C'):
327 C = self.params.C
328 if not operator.isSequenceType(C):
329
330 C = [C]
331
332 Cs = list(C[:])
333 for i in xrange(len(Cs)):
334 if Cs[i]<0:
335 Cs[i] = self._getDefaultC(dataset.samples)*abs(Cs[i])
336 if __debug__:
337 debug("SG_", "Default C for %s was computed to be %s" %
338 (C[i], Cs[i]))
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360 svm_impl_class = self.__get_implementation(ul)
361
362 if __debug__:
363 debug("SG", "Creating SVM instance of %s" % `svm_impl_class`)
364
365 if self._svm_impl in ['libsvr', 'svrlight']:
366
367 self.__svm = svm_impl_class(Cs[0], self.params.epsilon, self.__kernel, labels)
368 elif self._svm_impl in ['krr']:
369 self.__svm = svm_impl_class(self.params.tau, self.__kernel, labels)
370 else:
371 self.__svm = svm_impl_class(Cs[0], self.__kernel, labels)
372 self.__svm.set_epsilon(self.params.epsilon)
373 if Cs is not None and len(Cs) == 2:
374 if __debug__:
375 debug("SG_", "Since multiple Cs are provided: %s, assign them" % Cs)
376 self.__svm.set_C(Cs[0], Cs[1])
377
378 self.params.reset()
379 newsvm = True
380 _setdebug(self.__svm, 'SVM')
381
382 if self.params.isKnown('tube_epsilon') and \
383 hasattr(self.__svm, 'set_tube_epsilon'):
384 self.__svm.set_tube_epsilon(self.params.tube_epsilon)
385 self.__svm.parallel.set_num_threads(self.params.num_threads)
386 else:
387 if __debug__:
388 debug("SG_", "SVM instance is not re-created")
389 if _changedData['labels']:
390 if __debug__: debug("SG__", "Assigning new labels")
391 self.__svm.set_labels(labels)
392 if newkernel:
393 if __debug__: debug("SG__", "Assigning new kernel")
394 self.__svm.set_kernel(self.__kernel)
395 assert(_changedData['params'] is False)
396
397 if retrainable:
398
399 self.states.retrained = not newsvm or not newkernel
400
401
402 if __debug__ and 'SG' in debug.active:
403 if not self.regression:
404 lstr = " with labels %s" % dataset.uniquelabels
405 else:
406 lstr = ""
407 debug("SG", "%sTraining %s on data%s" %
408 (("","Re-")[retrainable and self.states.retrained],
409 self, lstr))
410
411 self.__svm.train()
412
413 if __debug__:
414 debug("SG_", "Done training SG_SVM %s" % self._kernel_type)
415
416
417 if (__debug__ and 'SG__' in debug.active) or \
418 self.states.isEnabled('training_confusion'):
419 trained_labels = self.__svm.classify().get_labels()
420 else:
421 trained_labels = None
422
423 if __debug__ and "SG__" in debug.active:
424 debug("SG__", "Original labels: %s, Trained labels: %s" %
425 (dataset.labels, trained_labels))
426
427
428
429
430
431
432
433
434
435 if self.regression and self.states.isEnabled('training_confusion'):
436 self.states.training_confusion = self._summaryClass(
437 targets=dataset.labels,
438 predictions=trained_labels)
439
441 """Predict values for the data
442 """
443
444 retrainable = self.params.retrainable
445
446 if retrainable:
447 changed_testdata = self._changedData['testdata'] or \
448 self.__kernel_test is None
449
450 if not retrainable or changed_testdata:
451 testdata = _tosg(data)
452
453 if not retrainable:
454 if __debug__:
455 debug("SG__",
456 "Initializing SVMs kernel of %s with training/testing samples"
457 % self)
458
459 self.__kernel.init(self.__traindata, testdata)
460 self.__condition_kernel(self.__kernel)
461 else:
462 if changed_testdata:
463 if __debug__:
464 debug("SG__",
465 "Re-creating testing kernel of %s giving "
466 "arguments %s" %
467 (`self._kernel_type`, self.__kernel_args))
468 kernel_test = self._kernel_type(self.__traindata, testdata,
469 *self.__kernel_args)
470 _setdebug(kernel_test, 'Kernels')
471
472 custk_args = ([self.__traindata, testdata], [])[
473 int(externals.exists('sg >= 0.6.4'))]
474 if __debug__:
475 debug("SG__",
476 "Re-creating custom testing kernel giving "
477 "arguments %s" % (str(custk_args)))
478 kernel_test_custom = shogun.Kernel.CustomKernel(*custk_args)
479
480 _setdebug(kernel_test_custom, 'Kernels')
481 self.__kernel_test = kernel_test_custom
482 self.__kernel_test.set_full_kernel_matrix_from_full(
483 kernel_test.get_kernel_matrix())
484 elif __debug__:
485 debug("SG__", "Re-using testing kernel")
486
487 assert(self.__kernel_test is not None)
488 self.__svm.set_kernel(self.__kernel_test)
489
490 if __debug__:
491 debug("SG_", "Classifying testing data")
492
493
494
495 values_ = self.__svm.classify()
496 if values_ is None:
497 raise RuntimeError, "We got empty list of values from %s" % self
498
499 values = values_.get_labels()
500
501 if retrainable:
502
503 self.states.repredicted = repredicted = not changed_testdata
504 if __debug__:
505 debug("SG__", "Re-assigning learing kernel. Repredicted is %s"
506 % repredicted)
507
508 self.__svm.set_kernel(self.__kernel)
509
510 if __debug__:
511 debug("SG__", "Got values %s" % values)
512
513 if ('regression' in self._clf_internals):
514 predictions = values
515 else:
516
517 _labels_dict = self._labels_dict
518 _labels_dict_rev = self._labels_dict_rev
519
520 if len(_labels_dict) == 2:
521 predictions = 1.0 - 2*N.signbit(values)
522 else:
523 predictions = values
524
525
526 label_type = type(_labels_dict.values()[0])
527
528
529 predictions = [_labels_dict_rev[label_type(x)]
530 for x in predictions]
531
532 if __debug__:
533 debug("SG__", "Tuned predictions %s" % predictions)
534
535
536
537
538 self.values = values
539
540
541 if not retrainable:
542 try:
543 testdata.free_features()
544 except:
545 pass
546
547 return predictions
548
549
551 super(SVM, self).untrain()
552 if not self.params.retrainable:
553 if __debug__:
554 debug("SG__", "Untraining %(clf)s and destroying sg's SVM",
555 msgargs={'clf':self})
556
557
558
559 if True:
560 if True:
561
562 if self.__kernel is not None:
563 del self.__kernel
564 self.__kernel = None
565
566 if self.__kernel_test is not None:
567 del self.__kernel_test
568 self.__kernel_test = None
569
570 if self.__svm is not None:
571 del self.__svm
572 self.__svm = None
573
574 if self.__traindata is not None:
575
576
577
578
579 self.__traindata.free_features()
580 del self.__traindata
581 self.__traindata = None
582
583 self.__traindataset = None
584
585
586
587
588
589 if __debug__:
590 debug("SG__",
591 "Done untraining %(self)s and destroying sg's SVM",
592 msgargs=locals())
593 elif __debug__:
594 debug("SG__", "Not untraining %(self)s since it is retrainable",
595 msgargs=locals())
596
597
599 if 'regression' in self._clf_internals or len(ul) == 2:
600 svm_impl_class = SVM._KNOWN_IMPLEMENTATIONS[self._svm_impl][0]
601 else:
602 if self._svm_impl == 'libsvm':
603 svm_impl_class = shogun.Classifier.LibSVMMultiClass
604 elif self._svm_impl == 'gmnp':
605 svm_impl_class = shogun.Classifier.GMNPSVM
606 else:
607 raise RuntimeError, \
608 "Shogun: Implementation %s doesn't handle multiclass " \
609 "data. Got labels %s. Use some other classifier" % \
610 (self._svm_impl, self.__traindataset.uniquelabels)
611 if __debug__:
612 debug("SG_", "Using %s for multiclass data of %s" %
613 (svm_impl_class, self._svm_impl))
614
615 return svm_impl_class
616
617
    # Read-only accessors to the shogun-side state
    svm = property(fget=lambda self: self.__svm)
    """Access to the SVM model."""

    traindataset = property(fget=lambda self: self.__traindataset)
    """Dataset which was used for training

    TODO -- might better become state variable I guess"""
625
626
627
628
629
# Register implementations which are present only in some shogun builds.
# The class is referenced by its dotted-path *string* and resolved lazily,
# since e.g. shogun.Classifier.SVMLight does not exist in GPL-free builds.
for name, item, params, descr in \
        [('mpd', "shogun.Classifier.MPDSVM",
          (('C',), ('binary',)),
          "MPD classifier from shogun"),
         ('lightsvm', "shogun.Classifier.SVMLight",
          (('C',), ('binary',)),
          "SVMLight classification http://svmlight.joachims.org/"),
         ('svrlight', "shogun.Regression.SVRLight",
          (('C', 'tube_epsilon',), ('regression',)),
          "SVMLight regression http://svmlight.joachims.org/")]:
    if externals.exists('shogun.%s' % name):
        # walk the dotted path under the shogun package, e.g.
        # "shogun.Classifier.MPDSVM" -> getattr chain from shogun
        impl_class = reduce(getattr, item.split('.')[1:], shogun)
        SVM._KNOWN_IMPLEMENTATIONS[name] = (impl_class,) + params + (descr,)


LinearSVMWeights._LEGAL_CLFS = [SVM]
642