1
2
3
4
5
6
7
8
9 """Collection of dataset splitters.
10
11 Module Description
12 ==================
13
14 Splitters are intended to split the provided dataset in various ways to
15 simplify cross-validation analysis, implement boosting of the
16 estimates, or sample null-space via permutation testing.
17
18 Most of the splitters at the moment split 2-ways -- conventionally
19 first part is used for training, and 2nd part for testing by
20 `CrossValidatedTransferError` and `SplitClassifier`.
21
22 Brief Description of Available Splitters
23 ========================================
24
25 * `NoneSplitter` - just return full dataset as the desired part (training/testing)
26 * `OddEvenSplitter` - 2 splits: (odd samples,even samples) and (even, odd)
27 * `HalfSplitter` - 2 splits: (first half, second half) and (second, first)
28 * `NFoldSplitter` - splits for N-Fold cross validation.
29
30 Module Organization
31 ===================
32
33 .. packagetree::
34 :style: UML
35
36 """
37
38 __docformat__ = 'restructuredtext'
39
40 import operator
41
42 import numpy as N
43
44 import mvpa.misc.support as support
45 from mvpa.base.dochelpers import enhancedDocString
46
47 if __debug__:
48 from mvpa.base import debug
49
class Splitter(object):
    """Base class of dataset splitters.

    Each splitter should be initialized with all its necessary parameters. The
    final splitting is done running the splitter object on a certain Dataset
    via __call__(). This method has to be implemented like a generator, i.e. it
    has to return every possible split with a yield() call.

    Each split has to be returned as a sequence of Datasets. The properties
    of the split datasets may vary between implementations. It is possible
    to declare a sequence element as 'None'.

    Please note, that even if there is only one Dataset returned it has to be
    an element in a sequence and not just the Dataset object!
    """

    _STRATEGIES = ('first', 'random', 'equidistant')
    _NPERLABEL_STR = ['equal', 'all']

    def __init__(self,
                 nperlabel='all',
                 nrunspersplit=1,
                 permute=False,
                 count=None,
                 strategy='equidistant',
                 attr='chunks'):
        """Initialize splitter base.

        :Parameters:
          nperlabel : int or str (or list of them) or float
            Number of dataset samples per label to be included in each
            split. If given as a float, it must be in [0,1] range and would
            mean the ratio of selected samples per each label.
            Two special strings are recognized: 'all' uses all available
            samples (default) and 'equal' uses the maximum number of samples
            that can be provided by all of the classes. This value might be
            provided as a sequence whose length matches the number of
            datasets per split and indicates the configuration for the
            respective dataset in each split.
          nrunspersplit : int
            Number of times samples for each split are chosen. This
            is mostly useful if a subset of the available samples
            is used in each split and the subset is randomly
            selected for each run (see the `nperlabel` argument).
          permute : bool
            If set to `True`, the labels of each generated dataset
            will be permuted on a per-chunk basis.
          count : None or int
            Desired number of splits to be output. It is limited by the
            number of splits possible for a given splitter
            (e.g. `OddEvenSplitter` can have only up to 2 splits). If None,
            all splits are output (default).
          strategy : str
            If `count` is not None, possible strategies are possible:
            first
              First `count` splits are chosen
            random
              Random (without replacement) `count` splits are chosen
            equidistant
              Splits which are equidistant from each other
          attr : str
            Sample attribute used to determine splits.
        """
        # defaults first -- real value is assigned via setNPerLabel() below
        self.__nperlabel = None
        self.__runspersplit = nrunspersplit
        self.__permute = permute
        self.__splitattr = attr

        self.count = count
        """Number (max) of splits to output on call"""

        self._setStrategy(strategy)

        self.setNPerLabel(nperlabel)


    __doc__ = enhancedDocString('Splitter', locals())


    def _setStrategy(self, strategy):
        """Set strategy used to select `count` splits out of all available.

        NOTE(review): this method was missing from the mangled source and
        was reconstructed -- it is referenced by __init__() and by the
        `strategy` property below; verify against project history.
        """
        strategy = strategy.lower()
        if strategy not in self._STRATEGIES:
            raise ValueError("strategy '%s' is not known. Known are %s"
                             % (strategy, str(self._STRATEGIES)))
        self.__strategy = strategy


    def setNPerLabel(self, value):
        """Set the number of samples per label in the split datasets.

        'equal' sets sample size to highest possible number of samples that
        can be provided by each class. 'all' uses all available samples
        (default).
        """
        # str check is consistent with the one performed in __call__()
        if isinstance(value, str):
            if value not in self._NPERLABEL_STR:
                raise ValueError(
                    "Unsupported value '%s' for nperlabel."
                    " Supported ones are %s or float or int"
                    % (value, self._NPERLABEL_STR))
        self.__nperlabel = value


    def _getSplitConfig(self, uniqueattrs):
        """Each subclass has to implement this method. It gets a sequence with
        the unique attribute ids of a dataset and has to return a list of
        lists containing attribute ids to split into the second dataset.
        """
        raise NotImplementedError


    def __call__(self, dataset):
        """Splits the dataset.

        This method behaves like a generator.
        """
        # local bindings of the dataset class methods -- they are used
        # repeatedly in the loops below
        ds_class = dataset.__class__
        DS_permuteLabels = ds_class.permuteLabels
        try:
            DS_getNSamplesPerLabel = ds_class._getNSamplesPerAttr
        except AttributeError:
            # only needed for the 'equal' / ratio nperlabel modes, so a
            # dataset class lacking it can still be split with 'all'
            pass
        DS_getRandomSamples = ds_class.getRandomSamples

        # for each split
        cfgs = self.splitcfg(dataset)

        # limit the number of splits if requested
        count, Ncfgs = self.count, len(cfgs)

        # Select just a subset of splits if desired
        if count is not None and count < Ncfgs:
            if count < 1:
                # nothing to output
                return
            strategy = self.strategy
            if strategy == 'first':
                cfgs = cfgs[:count]
            elif strategy in ['equidistant', 'random']:
                if strategy == 'equidistant':
                    # figure out what to select
                    step = float(Ncfgs) / count
                    assert(step >= 1.0)
                    indexes = [int(round(step * i)) for i in range(count)]
                elif strategy == 'random':
                    indexes = N.random.permutation(range(Ncfgs))[:count]
                    # doesn't matter much, but lets keep them in the
                    # original order at least
                    indexes.sort()
                else:
                    # defensive guard -- cannot be reached given the
                    # surrounding elif condition
                    raise RuntimeError("Really should not happen")
                if __debug__:
                    debug("SPL", "For %s strategy selected %s splits "
                          "from %d total" % (strategy, indexes, Ncfgs))
                cfgs = [cfgs[i] for i in indexes]

        # Finally split the data
        for split in cfgs:

            # determine sample sizes per dataset in the split
            if not operator.isSequenceType(self.__nperlabel) \
                   or isinstance(self.__nperlabel, str):
                nperlabelsplit = [self.__nperlabel] * len(split)
            else:
                nperlabelsplit = self.__nperlabel

            # get splitted datasets
            split_ds = self.splitDataset(dataset, split)

            # do multiple post-processing runs for this split
            for run in range(self.__runspersplit):

                # post-process all datasets
                finalized_datasets = []

                for ds, nperlabel in zip(split_ds, nperlabelsplit):
                    # permute the labels
                    if self.__permute:
                        DS_permuteLabels(ds, True, perchunk=True)

                    # select subset of samples if requested
                    if nperlabel == 'all' or ds is None:
                        finalized_datasets.append(ds)
                    else:
                        # go for maximum possible number of samples provided
                        # by each label in this dataset
                        if nperlabel == 'equal':
                            # determine the min number of samples per class
                            npl = N.array(list(DS_getNSamplesPerLabel(
                                ds, attrib='labels').values())).min()
                        elif isinstance(nperlabel, float) or (
                            operator.isSequenceType(nperlabel) and
                            len(nperlabel) > 0 and
                            isinstance(nperlabel[0], float)):
                            # per-label ratio(s) of the available samples
                            counts = N.array(list(DS_getNSamplesPerLabel(
                                ds, attrib='labels').values()))
                            npl = (counts * nperlabel).round().astype(int)
                        else:
                            npl = nperlabel

                        # finally select the patterns
                        finalized_datasets.append(
                            DS_getRandomSamples(ds, npl))

                yield finalized_datasets


    def splitDataset(self, dataset, specs):
        """Split a dataset by separating the samples where the configured
        sample attribute matches an element of `specs`.

        :Parameters:
          dataset : Dataset
            This is the source dataset.
          specs : sequence of sequences
            Contains ids of a sample attribute that shall be split into
            another dataset.

        :Returns: Tuple of splitted datasets.
        """
        # collect the boolean filters for all but a possible 'None' spec
        filters = []
        none_specs = 0
        cum_filter = None

        # attrgetter instead of eval() -- equivalent for (possibly dotted)
        # attribute names, without executing an arbitrary string
        splitattr_data = operator.attrgetter(self.__splitattr)(dataset)
        for spec in specs:
            if spec is None:
                filters.append(None)
                none_specs += 1
            else:
                filter_ = N.array([i in spec
                                   for i in splitattr_data])
                filters.append(filter_)
                # 'is None', not '== None': filter_/cum_filter are ndarrays
                # and elementwise comparison with None is ambiguous
                if cum_filter is None:
                    cum_filter = filter_
                else:
                    # NOTE(review): logical_and of disjoint spec filters is
                    # all-False, which makes the 'None' spec select every
                    # sample -- kept as in the original; verify intent
                    cum_filter = N.logical_and(cum_filter, filter_)

        # need to turn possible Nones into proper ids sequences
        if none_specs > 1:
            raise ValueError("Splitter cannot handle more than one `None` "
                             "split definition.")

        for i, filter_ in enumerate(filters):
            if filter_ is None:
                filters[i] = N.logical_not(cum_filter)

        # split data: return None if no samples are left for a part
        split_datasets = []

        # local binding for the loop
        dataset_selectSamples = dataset.selectSamples
        for filter_ in filters:
            if (filter_ == False).all():
                split_datasets.append(None)
            else:
                split_datasets.append(dataset_selectSamples(filter_))

        return split_datasets


    def __str__(self):
        """String summary over the object
        """
        return \
            "SplitterConfig: nperlabel:%s runs-per-split:%d permute:%s" \
            % (self.__nperlabel, self.__runspersplit, self.__permute)


    def splitcfg(self, dataset):
        """Return splitcfg for a given dataset"""
        # e.g. attr='chunks' reads dataset.uniquechunks
        return self._getSplitConfig(
            operator.attrgetter('unique' + self.__splitattr)(dataset))


    strategy = property(fget=lambda self: self.__strategy,
                        fset=_setStrategy)
341
342
class NoneSplitter(Splitter):
    """This is a dataset splitter that does **not** split. It simply returns
    the full dataset that it is called with.

    The passed dataset is returned as the second element of the 2-tuple.
    The first element of that tuple will always be 'None'.
    """

    _known_modes = ['first', 'second']

    def __init__(self, mode='second', **kwargs):
        """Cheap init -- nothing special

        :Parameters:
          mode
            Either 'first' or 'second' (default) -- which output dataset
            would actually contain the samples
        """
        Splitter.__init__(self, **kwargs)

        if mode not in NoneSplitter._known_modes:
            raise ValueError("Unknown mode %s for NoneSplitter" % mode)
        self.__mode = mode


    __doc__ = enhancedDocString('NoneSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Return just one full split: no first or second dataset.
        """
        if self.__mode == 'second':
            return [([], None)]
        else:
            return [(None, [])]


    def __str__(self):
        """String summary over the object
        """
        return "NoneSplitter / " + Splitter.__str__(self)
385
386
387
class OddEvenSplitter(Splitter):
    """Split a dataset into odd and even values of the sample attribute.

    The splitter yields two splits: first (odd, even) and second (even, odd).
    """

    def __init__(self, usevalues=False, **kwargs):
        """Cheap init.

        :Parameters:
          usevalues : bool
            If True the values of the attribute used for splitting will be
            used to determine odd and even samples. If False odd and even
            chunks are defined by the order of attribute values, i.e. first
            unique attribute is odd, second is even, despite the
            corresponding values might indicate the opposite (e.g. in case
            of [2,3]).
        """
        Splitter.__init__(self, **kwargs)

        self.__usevalues = usevalues


    __doc__ = enhancedDocString('OddEvenSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Return two splits: (odd, even) and (even, odd)."""
        if self.__usevalues:
            # parity of the attribute values themselves
            return [(None, uniqueattrs[(uniqueattrs % 2) == True]),
                    (None, uniqueattrs[(uniqueattrs % 2) == False])]
        else:
            # parity of the attribute position, regardless of its value
            return [(None, uniqueattrs[N.arange(len(uniqueattrs)) % 2 == True]),
                    (None, uniqueattrs[N.arange(len(uniqueattrs)) % 2 == False])]


    def __str__(self):
        """String summary over the object
        """
        return "OddEvenSplitter / " + Splitter.__str__(self)
430
431
432
class HalfSplitter(Splitter):
    """Split a dataset into two halves of the sample attribute.

    The splitter yields two splits: first (1st half, 2nd half) and second
    (2nd half, 1st half).
    """

    def __init__(self, **kwargs):
        """Cheap init -- nothing special.

        NOTE(review): this constructor was missing from the mangled source
        and was reconstructed as a plain pass-through to the base class.
        """
        Splitter.__init__(self, **kwargs)


    __doc__ = enhancedDocString('HalfSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Return two splits: (1st half, 2nd half) and the reverse."""
        # '//' keeps the index integral (identical to '/' on Python 2 ints,
        # correct under true division as well)
        return [(None, uniqueattrs[:len(uniqueattrs) // 2]),
                (None, uniqueattrs[len(uniqueattrs) // 2:])]


    def __str__(self):
        """String summary over the object
        """
        return "HalfSplitter / " + Splitter.__str__(self)
460
461
462
class NFoldSplitter(Splitter):
    """Generic N-fold data splitter.

    Provide folding splitting. Given a dataset with N chunks, with
    cvtype=1 (which is default), it would generate N splits, where
    each chunk sequentially is taken out (with replacement) for
    cross-validation. Example, if there is 4 chunks, splits for
    cvtype=1 are:

        [[1, 2, 3], [0]]
        [[0, 2, 3], [1]]
        [[0, 1, 3], [2]]
        [[0, 1, 2], [3]]

    If cvtype>1, then all possible combinations of cvtype number of
    chunks are taken out for testing, so for cvtype=2 in previous
    example:

        [[2, 3], [0, 1]]
        [[1, 3], [0, 2]]
        [[1, 2], [0, 3]]
        [[0, 3], [1, 2]]
        [[0, 2], [1, 3]]
        [[0, 1], [2, 3]]
    """

    def __init__(self,
                 cvtype=1,
                 **kwargs):
        """Initialize the N-fold splitter.

        :Parameters:
          cvtype : int
            Type of cross-validation: N-(cvtype)
          kwargs
            Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **kwargs)

        self.__cvtype = cvtype


    __doc__ = enhancedDocString('NFoldSplitter', locals(), Splitter)


    def __str__(self):
        """String summary over the object
        """
        return "N-%d-FoldSplitter / " % self.__cvtype + Splitter.__str__(self)


    def _getSplitConfig(self, uniqueattrs):
        """Returns proper split configuration for N-M fold split.
        """
        # every length-cvtype combination of unique attributes becomes the
        # second (testing) part of one split
        return [(None, i) for i in
                support.getUniqueLengthNCombinations(uniqueattrs,
                                                     self.__cvtype)]
523
524
525
class CustomSplitter(Splitter):
    """Split a dataset using an arbitrary custom rule.

    The splitter is configured by passing a custom splitting rule
    (`splitrule`) to its constructor. Such a rule is basically a sequence of
    split definitions. Every single element in this sequence results in
    exactly one split generated by the Splitter. Each element is another
    sequence of sequences of sample ids for each dataset that shall be
    generated in the split.

    Example:

    * Generate two splits. In the first split the *second* dataset
      contains all samples with sample attributes corresponding to
      either 0, 1 or 2. The *first* dataset of the first split contains
      all samples which are not split into the second dataset.

      The second split yields three datasets. The first with all samples
      corresponding to sample attributes 1 and 2, the second dataset
      contains only samples with attribute 3 and the last dataset
      contains the samples with attribute 5 and 6.

      CustomSplitter([(None, [0, 1, 2]), ([1, 2], [3], [5, 6])])
    """

    def __init__(self, splitrule, **kwargs):
        """Cheap init.

        :Parameters:
          splitrule : sequence of sequences
            Custom split configuration (see the class docstring).
        """
        Splitter.__init__(self, **kwargs)

        self.__splitrule = splitrule


    __doc__ = enhancedDocString('CustomSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Return the user-provided splitrule unchanged."""
        return self.__splitrule


    def __str__(self):
        """String summary over the object
        """
        return "CustomSplitter / " + Splitter.__str__(self)
571