1
2
3
4
5
6
7
8
9 """Estimator for classifier error distributions."""
10
11 __docformat__ = 'restructuredtext'
12
13 import numpy as N
14
15 from mvpa.base import externals, warning
16 from mvpa.misc.state import Stateful, StateVariable
17
18 if __debug__:
19 from mvpa.base import debug
20
21
23 """Non-parametric 1d distribution -- derives cdf based on stored values.
24
25 Introduced to complement parametric distributions present in scipy.stats.
26 """
27
30
31
32 @staticmethod
33 - def fit(dist_samples):
35
36
38 """Returns the cdf value at `x`.
39 """
40 return N.vectorize(lambda v:(self._dist_samples <= v).mean())(x)
41
42
def _pvalue(x, cdf_func, tail, return_tails=False, name=None):
    """Compute p-value(s) of `x` given a cumulative distribution function.

    :Parameters:
      x : scalar or sequence
        Value(s) for which to compute p-values.
      cdf_func : callable
        Function to be used to derive cdf values for x.
      tail : str ('left', 'right', 'any', 'both')
        Which tail of the distribution to report.  For 'any' and 'both'
        the tail is chosen per value by comparing its cdf to p=0.5.
        With 'any' significance is taken like in a one-tailed test;
        with 'both' the one-tailed p is doubled.
      return_tails : bool
        If True, a tuple (pvalues, tails) is returned, where tails
        holds True for values taken from the right tail and False for
        values taken from the left tail.
      name : str or None
        Optional distribution name, used only in the stability warning.
    """
    scalar_input = N.isscalar(x)
    if scalar_input:
        x = [x]

    pvalues = cdf_func(x)

    # optional paranoia: a proper cdf must stay within [0, 1]
    if __debug__ and 'CHECK_STABILITY' in debug.active:
        pv_min, pv_max = N.min(pvalues), N.max(pvalues)
        if pv_min < 0 or pv_max > 1.0:
            s = ('', ' for %s' % name)[int(name is not None)]
            warning('Stability check of cdf %s failed%s. Min=%s, max=%s' % \
                    (cdf_func, s, pv_min, pv_max))

    # numerical round-off may push values marginally outside [0, 1]
    pvalues = N.clip(pvalues, 0, 1.0)

    if tail == 'left':
        # cdf already is the left-tail p-value
        if return_tails:
            from_right = N.zeros(pvalues.shape, dtype=bool)
    elif tail == 'right':
        pvalues = 1 - pvalues
        if return_tails:
            from_right = N.ones(pvalues.shape, dtype=bool)
    elif tail in ('any', 'both'):
        # choose per value the tail it belongs to
        from_right = (pvalues >= 0.5)
        pvalues[from_right] = 1.0 - pvalues[from_right]
        if tail == 'both':
            # two-tailed test: double the one-tailed p-value
            pvalues *= 2

    # NaN inputs can never be significant
    pvalues[N.isnan(x)] = 1.0

    if scalar_input:
        result = pvalues[0]
    else:
        result = pvalues

    if return_tails:
        return (result, from_right)
    return result
99
100
102 """Base class for null-hypothesis testing.
103
104 """
105
106
107
108
109
110 _ATTRIBUTE_COLLECTIONS = ['states']
111
def __init__(self, tail='both', **kwargs):
    """Cheap initialization.

    :Parameter:
      tail: str ('left', 'right', 'any', 'both')
        Which tail of the distribution to report.  For 'any' and
        'both' the tail is chosen per value by comparison to p=0.5;
        with 'any' significance is taken like in a one-tailed test.
    """
    # let the state machinery initialize itself first
    Stateful.__init__(self, **kwargs)
    # validates the tail value and stores it
    self._setTail(tail)
125
126
128
129 if tail not in ('left', 'right', 'any', 'both'):
130 raise ValueError, 'Unknown value "%s" to `tail` argument.' \
131 % tail
132 self.__tail = tail
133
134
def fit(self, measure, wdata, vdata=None):
    """Fit the null distribution to the data -- subclasses must implement."""
    raise NotImplementedError
138
139
141 """Implementations return the value of the cumulative distribution
142 function (left or right tail dpending on the setting).
143 """
144 raise NotImplementedError
145
146
def p(self, x, **kwargs):
    """Return the p-value(s) for value(s) of `x`.

    Values are taken from the left, right, or any tail of the
    distribution, depending on the `tail` setting given at
    construction time.

    In case a `FeaturewiseDatasetMeasure` was used to estimate the
    distribution the method returns an array; `x` can then be a scalar
    or an array of a matching shape.
    """
    # delegate to the shared helper with the configured tail
    tail = self.__tail
    return _pvalue(x, self.cdf, tail, **kwargs)
157
158
159 tail = property(fget=lambda x:x.__tail, fset=_setTail)
160
161
163 """Null-hypothesis distribution is estimated from randomly permuted data labels.
164
165 The distribution is estimated by calling fit() with an appropriate
166 `DatasetMeasure` or `TransferError` instance and a training and a
167 validation dataset (in case of a `TransferError`). For a customizable
168 amount of cycles the training data labels are permuted and the
169 corresponding measure computed. In case of a `TransferError` this is the
170 error when predicting the *correct* labels of the validation dataset.
171
172 The distribution can be queried using the `cdf()` method, which can be
173 configured to report probabilities/frequencies from `left` or `right` tail,
174 i.e. fraction of the distribution that is lower or larger than some
175 critical value.
176
177 This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
178 returns an array of featurewise probabilities/frequencies.
179 """
180
181 _DEV_DOC = """
182 TODO automagically decide on the number of samples/permutations needed
183 Caution should be paid though since resultant distributions might be
184 quite far from some conventional ones (e.g. Normal) -- it is expected to
185 them to be bimodal (or actually multimodal) in many scenarios.
186 """
187
188 dist_samples = StateVariable(enabled=False,
189 doc='Samples obtained for each permutation')
190
192 """Initialize Monte-Carlo Permutation Null-hypothesis testing
193
194 :Parameter:
195 dist_class: class
196 This can be any class which provides parameters estimate
197 using `fit()` method to initialize the instance, and
198 provides `cdf(x)` method for estimating value of x in CDF.
199 All distributions from SciPy's 'stats' module can be used.
200 permutations: int
201 This many permutations of label will be performed to
202 determine the distribution under the null hypothesis.
203 """
204 NullDist.__init__(self, **kwargs)
205
206 self._dist_class = dist_class
207 self._dist = []
208
209 self.__permutations = permutations
210 """Number of permutations to compute the estimate the null
211 distribution."""
212
213
214
def fit(self, measure, wdata, vdata=None):
    """Estimate the null distribution via repeated label permutations.

    :Parameter:
      measure: (`Featurewise`)`DatasetMeasure` | `TransferError`
        Measure (or transfer error) computed once per permutation.
      wdata: `Dataset` whose labels get permuted; used to compute the
        measure/transfer error multiple times.
      vdata: `Dataset` used for validation.
        If provided, measure is assumed to be a `TransferError` and
        working and validation dataset are passed on to it.
    """
    # validation data, when given, is passed first to the measure
    if vdata is not None:
        measure_args = [vdata, wdata]
    else:
        measure_args = [wdata]

    # collect one measure value per permutation of the labels
    scores = []
    for i in xrange(self.__permutations):
        # permute the labels in place (perchunk=False: across chunks)
        wdata.permuteLabels(True, perchunk=False)
        scores.append(measure(*measure_args))

    # restore the original labels
    wdata.permuteLabels(False, perchunk=False)

    scores = N.asarray(scores)
    # expose the raw samples through the state variable
    self.dist_samples = scores

    nperm = scores.shape[0]
    # guarantee a trailing feature axis so reshape below always works
    if scores.ndim == 1:
        scores = scores[:, N.newaxis]

    # flatten any featurewise structure: one column per feature
    flat_scores = scores.reshape((nperm, -1))
    fitted = []
    for samples in flat_scores.T:
        params = self._dist_class.fit(samples)
        if __debug__ and 'STAT' in debug.active:
            debug('STAT', 'Estimated parameters for the %s are %s'
                  % (self._dist_class, str(params)))
        fitted.append(self._dist_class(*params))
    self._dist = fitted
280
281
283 """Return value of the cumulative distribution function at `x`.
284 """
285 if self._dist is None:
286
287
288
289 raise RuntimeError, "Distribution has to be fit first"
290
291 is_scalar = N.isscalar(x)
292 if is_scalar:
293 x = [x]
294
295 x = N.asanyarray(x)
296 xshape = x.shape
297
298 x = x.reshape((-1,))
299
300 if len(self._dist) != len(x):
301 raise ValueError, 'Distribution was fit for structure with %d' \
302 ' elements, whenever now queried with %d elements' \
303 % (len(self._dist), len(x))
304
305
306 cdfs = [ dist.cdf(v) for v, dist in zip(x, self._dist) ]
307 return N.array(cdfs).reshape(xshape)
308
309
311 """Clean stored distributions
312
313 Storing all of the distributions might be too expensive
314 (e.g. in case of Nonparametric), and the scope of the object
315 might be too broad to wait for it to be destroyed. Clean would
316 bind dist_samples to empty list to let gc revoke the memory.
317 """
318 self._dist = []
319
320
321
323 """Proxy/Adaptor class for SciPy distributions.
324
325 All distributions from SciPy's 'stats' module can be used with this class.
326
327 >>> import numpy as N
328 >>> from scipy import stats
329 >>> from mvpa.clfs.stats import FixedNullDist
330 >>>
331 >>> dist = FixedNullDist(stats.norm(loc=2, scale=4))
332 >>> dist.p(2)
333 0.5
334 >>>
335 >>> dist.cdf(N.arange(5))
336 array([ 0.30853754, 0.40129367, 0.5 , 0.59870633, 0.69146246])
337 >>>
338 >>> dist = FixedNullDist(stats.norm(loc=2, scale=4), tail='right')
339 >>> dist.p(N.arange(5))
340 array([ 0.69146246, 0.59870633, 0.5 , 0.40129367, 0.30853754])
341 """
343 """
344 :Parameter:
345 dist: distribution object
346 This can be any object the has a `cdf()` method to report the
347 cumulative distribition function values.
348 """
349 NullDist.__init__(self, **kwargs)
350
351 self._dist = dist
352
353
354 - def fit(self, measure, wdata, vdata=None):
355 """Does nothing since the distribution is already fixed."""
356 pass
357
358
360 """Return value of the cumulative distribution function at `x`.
361 """
362 return self._dist.cdf(x)
363
364
366 """Adaptive distribution which adjusts parameters according to the data
367
368 WiP: internal implementation might change
369 """
def fit(self, measure, wdata, vdata=None):
    """Adapt distribution parameters to the dimensionality of `measure`.

    Determines the number of features the measure operates on and asks
    `_adapt()` (subclass hook) for matching distribution parameters.
    """
    try:
        nfeatures = len(measure.feature_ids)
    except ValueError:
        # fall back on the shape of the working data
        nfeatures = N.prod(wdata.shape[1:])

    generator = self._dist
    if not hasattr(generator, 'fit'):
        # presumably a frozen distribution -- unwrap the generator
        generator = generator.dist

    args, kwargs = self._adapt(nfeatures, measure, wdata, vdata)
    if __debug__:
        debug('STAT', 'Adapted parameters for %s to be %s, %s'
              % (generator, args, kwargs))
    self._dist = generator(*args, **kwargs)
388
389
def _adapt(self, nfeatures, measure, wdata, vdata=None):
    """Return (args, kwargs) for the distribution -- subclasses implement."""
    raise NotImplementedError
392
393
395 """Adaptive rdist: params are (nfeatures-1, 0, 1)
396 """
397
398 - def _adapt(self, nfeatures, measure, wdata, vdata=None):
400
401
402
403
404
406 cdf_ = self._dist.cdf(x)
407 bad_values = N.where(N.abs(cdf_)>1)
408
409
410
411 if len(bad_values[0]):
412
413
414 cdf_bad = cdf_[bad_values]
415 x_bad = x[bad_values]
416 cdf_bad[x_bad<0] = 0.0
417 cdf_bad[x_bad>=0] = 1.0
418 cdf_[bad_values] = cdf_bad
419 return cdf_
420
421
423 """Adaptive Normal Distribution: params are (0, sqrt(1/nfeatures))
424 """
425
426 - def _adapt(self, nfeatures, measure, wdata, vdata=None):
428
429
430 if externals.exists('scipy'):
431 from mvpa.support.stats import scipy
432 from scipy.stats import kstest
433 """
434 Thoughts:
435
436 So we can use `scipy.stats.kstest` (Kolmogorov-Smirnov test) to
437 check/reject H0 that samples come from a given distribution. But
438 since it is based on a full range of data, we might better of with
439 some ad-hoc checking by the detection power of the values in the
440 tail of a tentative distribution.
441
442 """
443
444
445
446
447
448
449 import scipy
450
452 """Helper proxy-class to fit distribution when some parameters are known
453
454 It is an ugly hack with snippets of code taken from scipy, which is
455 Copyright (c) 2001, 2002 Enthought, Inc.
456 and is distributed under BSD license
457 http://www.scipy.org/License_Compatibility
458 """
459
def __init__(self, dist, loc=None, scale=None, args=None):
    """Wrap `dist`, freezing the given loc/scale/shape arguments.

    dist : scipy.stats distribution generator to proxy
    loc, scale : fixed values, or None to leave them free for fit()
    args : tuple of shape arguments; None entries remain free
    """
    self._dist = dist
    # loc and scale always occupy the two trailing parameter slots
    theta = (loc, scale)

    Narg_ = dist.numargs
    if args is not None:
        Narg = len(args)
        if Narg > Narg_:
            raise ValueError, \
                  'Distribution %s has only %d arguments. Got %d' \
                  % (dist, Narg_, Narg)
        # pad with None so every shape argument has an entry
        args += (None,) * (Narg_ - Narg)
    else:
        # no shape argument fixed -- all stay free
        args = (None,) * Narg_

    # indices of the shape arguments which remain free (to be fit)
    args_i = [i for i,v in enumerate(args) if v is None]
    self._fargs = (list(args+theta), args_i)
    """Arguments which should get some fixed value"""
480
481
def nnlf(self, theta, x):
    """Negative log-likelihood of `x` given the free parameters `theta`.

    Adapted from scipy.stats rv_continuous.nnlf: here `theta` carries
    only the FREE parameters; fixed ones are filled in from
    `self._fargs`.
    """
    # fargs: full parameter list (shape args + loc + scale) with fixed
    # values present and None placeholders for free ones;
    # fargs_i: indices of the free shape arguments
    fargs, fargs_i = self._fargs
    try:
        i=-1
        # scale: fixed value, or taken from the tail of theta
        if fargs[-1] is not None:
            scale = fargs[-1]
        else:
            scale = theta[i]
            i -= 1

        # loc: fixed value, or next-from-tail element of theta
        if fargs[-2] is not None:
            loc = fargs[-2]
        else:
            loc = theta[i]
            i -= 1

        # leading entries of theta are the free shape arguments
        args = theta[:i+1]

        # merge the free values into the full argument list; pairing
        # free indices with leading theta entries keeps them in order
        for i,a in zip(fargs_i, args):
            fargs[i] = a
        args = fargs[:-2]

    except IndexError:
        raise ValueError, "Not enough input arguments."
    if not self._argcheck(*args) or scale <= 0:
        # invalid parameters -> infinitely bad likelihood
        return N.inf
    # standardize the data by the candidate loc/scale
    x = N.asarray((x-loc) / scale)
    cond0 = (x <= self.a) | (x >= self.b)
    if (N.any(cond0)):
        # data outside the distribution's support -> reject
        return N.inf
    else:
        return self._nnlf(x, *args) + len(x)*N.log(scale)
517
518 - def fit(self, data, *args, **kwds):
519 loc0, scale0 = map(kwds.get, ['loc', 'scale'], [0.0, 1.0])
520 fargs, fargs_i = self._fargs
521 Narg = len(args)
522 Narg_ = self.numargs
523 if Narg != Narg_:
524 if Narg > Narg_:
525 raise ValueError, "Too many input arguments."
526 else:
527 args += (1.0,)*(self.numargs-Narg)
528
529
530
531 if len(fargs_i) != Narg_:
532 x0 = tuple([args[i] for i in fargs_i])
533 else:
534 x0 = args
535 if fargs[-2] is None: x0 = x0 + (loc0,)
536 if fargs[-1] is None: x0 = x0 + (scale0,)
537
538 opt_x = scipy.optimize.fmin(self.nnlf, x0, args=(N.ravel(data),), disp=0)
539
540
541 i = 0
542 loc, scale = fargs[-2:]
543 if fargs[-1] is None:
544 i -= 1
545 scale = opt_x[i]
546 if fargs[-2] is None:
547 i -= 1
548 loc = opt_x[i]
549
550
551 for i in fargs_i:
552 fargs[i] = opt_x[i]
553
554
555 opt_x = N.hstack((fargs[:-2], (loc, scale)))
556 return opt_x
557
558
560 if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
561 self._dist.__setattr__(a, v)
562 else:
563 object.__setattr__(self, a, v)
564
565
567 """We need to redirect all queries correspondingly
568 """
569 if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
570 return getattr(self._dist, a)
571 else:
572 return object.__getattribute__(self, a)
573
574
575
576 - def matchDistribution(data, nsamples=None, loc=None, scale=None,
577 args=None, test='kstest', distributions=None,
578 **kwargs):
579 """Determine best matching distribution.
580
581 Can be used for 'smelling' the data, as well to choose a
582 parametric distribution for data obtained from non-parametric
583 testing (e.g. `MCNullDist`).
584
585 WiP: use with caution, API might change
586
587 :Parameters:
588 data : N.ndarray
589 Array of the data for which to deduce the distribution. It has
590 to be sufficiently large to make a reliable conclusion
591 nsamples : int or None
592 If None -- use all samples in data to estimate parametric
593 distribution. Otherwise use only specified number randomly selected
594 from data.
595 loc : float or None
596 Loc for the distribution (if known)
597 scale : float or None
598 Scale for the distribution (if known)
599 test : basestring
600 What kind of testing to do. Choices:
601 'p-roc' : detection power for a given ROC. Needs two
602 parameters: `p=0.05` and `tail='both'`
603 'kstest' : 'full-body' distribution comparison. The best
604 choice is made by minimal reported distance after estimating
605 parameters of the distribution. Parameter `p=0.05` sets
606 threshold to reject null-hypothesis that distribution is the
607 same.
608 WARNING: older versions (e.g. 0.5.2 in etch) of scipy have
609 incorrect kstest implementation and do not function
610 properly
611 distributions : None or list of basestring or tuple(basestring, dict)
612 Distributions to check. If None, all known in scipy.stats
613 are tested. If distribution is specified as a tuple, then
614 it must contain name and additional parameters (name, loc,
615 scale, args) in the dictionary. Entry 'scipy' adds all known
616 in scipy.stats.
617 **kwargs
618 Additional arguments which are needed for each particular test
619 (see above)
620
621 :Example:
622 data = N.random.normal(size=(1000,1));
623 matches = matchDistribution(
624 data,
625 distributions=['rdist',
626 ('rdist', {'name':'rdist_fixed',
627 'loc': 0.0,
628 'args': (10,)})],
629 nsamples=30, test='p-roc', p=0.05)
630 """
631
632
633 _KNOWN_TESTS = ['p-roc', 'kstest']
634 if not test in _KNOWN_TESTS:
635 raise ValueError, 'Unknown kind of test %s. Known are %s' \
636 % (test, _KNOWN_TESTS)
637
638 data = N.ravel(data)
639
640 if nsamples is not None:
641 if __debug__:
642 debug('STAT', 'Sampling %d samples from data for the ' \
643 'estimation of the distributions parameters' % nsamples)
644 indexes_selected = (N.random.sample(nsamples)*len(data)).astype(int)
645 data_selected = data[indexes_selected]
646 else:
647 indexes_selected = N.arange(len(data))
648 data_selected = data
649
650 p_thr = kwargs.get('p', 0.05)
651 if test == 'p-roc':
652 tail = kwargs.get('tail', 'both')
653 data_p = _pvalue(data, Nonparametric(data).cdf, tail)
654 data_p_thr = N.abs(data_p) <= p_thr
655 true_positives = N.sum(data_p_thr)
656 if true_positives == 0:
657 raise ValueError, "Provided data has no elements in non-" \
658 "parametric distribution under p<=%.3f. Please " \
659 "increase the size of data or value of p" % p
660 if __debug__:
661 debug('STAT_', 'Number of positives in non-parametric '
662 'distribution is %d' % true_positives)
663
664 if distributions is None:
665 distributions = ['scipy']
666
667
668 try:
669 scipy_ind = distributions.index('scipy')
670 distributions.pop(scipy_ind)
671 distributions += scipy.stats.distributions.__all__
672 except ValueError:
673 pass
674
675 results = []
676 for d in distributions:
677 dist_gen, loc_, scale_, args_ = None, loc, scale, args
678 if isinstance(d, basestring):
679 dist_gen = d
680 dist_name = d
681 elif isinstance(d, tuple):
682 if not (len(d)==2 and isinstance(d[1], dict)):
683 raise ValueError,\
684 "Tuple specification of distribution must be " \
685 "(d, {params}). Got %s" % (d,)
686 dist_gen = d[0]
687 loc_ = d[1].get('loc', loc)
688 scale_ = d[1].get('scale', scale)
689 args_ = d[1].get('args', args)
690 dist_name = d[1].get('name', str(dist_gen))
691 else:
692 dist_gen = d
693 dist_name = str(d)
694
695
696 try:
697 dist_gen_ = getattr(scipy.stats, dist_gen)
698
699
700 if args_ is not None or loc_ is not None or scale_ is not None:
701 dist_opt = rv_semifrozen(dist_gen_, loc=loc_, scale=scale_, args=args_)
702 else:
703 dist_opt = dist_gen_
704 dist_params = dist_opt.fit(data_selected)
705 if __debug__:
706 debug('STAT__',
707 'Got distribution parameters %s for %s'
708 % (dist_params, dist_name))
709 if test == 'p-roc':
710 cdf_func = lambda x: dist_gen_.cdf(x, *dist_params)
711
712 cdf_p = N.abs(_pvalue(data, cdf_func, tail, name=dist_gen))
713 cdf_p_thr = cdf_p <= p_thr
714 D, p = N.sum(N.abs(data_p_thr - cdf_p_thr))*1.0/true_positives, 1
715 if __debug__: res_sum = 'D=%.2f' % D
716 elif test == 'kstest':
717 D, p = kstest(data, d, args=dist_params)
718 if __debug__: res_sum = 'D=%.3f p=%.3f' % (D, p)
719 except (TypeError, ValueError, AttributeError,
720 NotImplementedError), e:
721 if __debug__:
722 debug('STAT__',
723 'Testing for %s distribution failed due to %s'
724 % (d, str(e)))
725 continue
726
727 if p > p_thr and not N.isnan(D):
728 results += [ (D, dist_gen, dist_name, dist_params) ]
729 if __debug__:
730 debug('STAT_', 'Testing for %s dist.: %s' % (dist_name, res_sum))
731 else:
732 if __debug__:
733 debug('STAT__', 'Cannot consider %s dist. with %s'
734 % (d, res_sum))
735 continue
736
737
738 results.sort()
739
740 if __debug__ and 'STAT' in debug.active:
741
742 nresults = len(results)
743 sresult = lambda r:'%s(%s)=%.2f' % (r[1], ', '.join(map(str, r[3])), r[0])
744 if nresults>0:
745 nnextbest = min(2, nresults-1)
746 snextbest = ', '.join(map(sresult, results[1:1+nnextbest]))
747 debug('STAT', 'Best distribution %s. Next best: %s'
748 % (sresult(results[0]), snextbest))
749 else:
750 debug('STAT', 'Could not find suitable distribution')
751
752
753 return results
754
755
if externals.exists('pylab'):
    import pylab as P

    def plotDistributionMatches(data, matches, nbins=31, nbest=5,
                                expand_tails=8, legend=2, plot_cdf=True,
                                p=None, tail='both'):
        """Plot best matching distributions

        :Parameters:
          data : N.ndarray
            Data which was used to obtain the matches
          matches : list of tuples
            Sorted matches as provided by matchDistribution
          nbins : int
            Number of bins in the histogram
          nbest : int
            Number of top matches to plot
          expand_tails : int
            How many bins away to add to parametrized distributions
            plots
          legend : int
            Either to provide legend and statistics in the legend.
            1 -- just lists distributions.
            2 -- adds distance measure
            3 -- tp/fp/fn in the case if p is provided
          plot_cdf : bool
            Either to plot cdf for data using non-parametric distribution
          p : float or None
            If not None, visualize null-hypothesis testing (given p).
            Bars in the histogram which fall under given p are colored
            in red. False positives and false negatives are marked as
            triangle up and down symbols correspondingly
          tail : ('left', 'right', 'any', 'both')
            If p is not None, the choice of tail for null-hypothesis
            testing

        :Returns: tuple(histogram, list of lines)
        """
        # hist is (counts, bin_edges, patches)
        hist = P.hist(data, nbins, normed=1, align='center')
        # NOTE(review): data_range is computed but never used below
        data_range = [N.min(data), N.max(data)]

        # extend the x range by `expand_tails` bins on each side so the
        # parametric curves are drawn beyond the data histogram
        x = hist[1]
        dx = x[expand_tails] - x[0]
        x = N.hstack((x[:expand_tails] - dx, x, x[-expand_tails:] + dx))

        nonparam = Nonparametric(data)

        # dashed black line: empirical (non-parametric) cdf
        if plot_cdf:
            P.plot(x, nonparam.cdf(x), 'k--', linewidth=1)

        p_thr = p

        # per-sample p-values under the non-parametric distribution
        data_p = _pvalue(data, nonparam.cdf, tail)
        data_p_thr = (data_p <= p_thr).ravel()

        # p-values at the (expanded) bin positions
        x_p = _pvalue(x, Nonparametric(data).cdf, tail)
        x_p_thr = N.abs(x_p) <= p_thr

        # color histogram bars red where they fall under the threshold
        # (skip the artificially prepended tail bins)
        for thr, bar in zip(x_p_thr[expand_tails:], hist[2]):
            bar.set_facecolor(('w','r')[int(thr)])

        if not len(matches):
            # nothing parametric to overlay
            warning("No matching distributions were provided -- nothing to plot")
            return (hist, )

        lines = []
        labels = []
        for i in xrange(min(nbest, len(matches))):
            D, dist_gen, dist_name, params = matches[i]
            # instantiate a frozen distribution with the fitted params
            dist = getattr(scipy.stats, dist_gen)(*params)

            label = '%s' % (dist_name)
            if legend > 1: label += '(D=%.2f)' % (D)

            # which x positions this candidate flags as significant
            xcdf_p = N.abs(_pvalue(x, dist.cdf, tail))
            xcdf_p_thr = (xcdf_p <= p_thr).ravel()

            if p is not None and legend > 2:
                # compare detection against the non-parametric reference
                data_cdf_p = N.abs(_pvalue(data, dist.cdf, tail))
                data_cdf_p_thr = (data_cdf_p <= p_thr).ravel()

                # true positives: flagged by both
                tp = N.logical_and(data_cdf_p_thr, data_p_thr)
                # false positives: flagged only by the candidate
                fp = N.logical_and(data_cdf_p_thr, ~data_p_thr)
                # false negatives: missed by the candidate
                fn = N.logical_and(~data_cdf_p_thr, data_p_thr)

                label += ' tp/fp/fn=%d/%d/%d)' % \
                        tuple(map(N.sum, [tp,fp,fn]))

            # solid line: candidate pdf
            pdf = dist.pdf(x)
            line = P.plot(x, pdf, '-', linewidth=2, label=label)
            color = line[0].get_color()

            # dotted line in the same color: candidate cdf
            if plot_cdf:
                cdf = dist.cdf(x)
                P.plot(x, cdf, ':', linewidth=1, color=color, label=label)

            if p is not None:
                # agreement of candidate vs non-parametric at bin centers
                xtp = N.logical_and(xcdf_p_thr, x_p_thr)
                xfp = N.logical_and(xcdf_p_thr, ~x_p_thr)
                xfn = N.logical_and(~xcdf_p_thr, x_p_thr)

                # mark false positives (^) and false negatives (v)
                P.plot(x[xfp], pdf[xfp], '^', color=color)
                P.plot(x[xfn], pdf[xfn], 'v', color=color)

            lines.append(line)
            labels.append(label)

        if legend:
            P.legend(lines, labels)

        return (hist, lines)
883
884
885
886
887
888
889
890
891
892
893
894
895
896
898 """Cheater for human beings -- wraps `dist` if needed with some
899 NullDist
900
901 tail and other arguments are assumed to be default as in
902 NullDist/MCNullDist
903 """
904 if dist is None or isinstance(dist, NullDist):
905 return dist
906 elif hasattr(dist, 'fit'):
907 if __debug__:
908 debug('STAT', 'Wrapping %s into MCNullDist' % dist)
909 return MCNullDist(dist)
910 else:
911 if __debug__:
912 debug('STAT', 'Wrapping %s into FixedNullDist' % dist)
913 return FixedNullDist(dist)
914
915
916
918 if axis is None:
919 a = N.ravel(a)
920 outaxis = 0
921 else:
922 a = N.asarray(a)
923 outaxis = axis
924 return a, outaxis
925
927 """Compute the mean over the given axis ignoring nans.
928
929 :Parameters:
930 x : ndarray
931 input array
932 axis : int
933 axis along which the mean is computed.
934
935 :Results:
936 m : float
937 the mean."""
938 x, axis = _chk_asarray(x,axis)
939 x = x.copy()
940 Norig = x.shape[axis]
941 factor = 1.0-N.sum(N.isnan(x),axis)*1.0/Norig
942
943 x[N.isnan(x)] = 0
944 return N.mean(x,axis)/factor
945