Package mvpa :: Package clfs :: Module stats
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.clfs.stats

  1  #emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  #ex: set sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Estimator for classifier error distributions.""" 
 10   
 11  __docformat__ = 'restructuredtext' 
 12   
 13  import numpy as N 
 14   
 15  from mvpa.base import externals, warning 
 16  from mvpa.misc.state import Stateful, StateVariable 
 17   
 18  if __debug__: 
 19      from mvpa.base import debug 
 20   
 21   
class Nonparametric(object):
    """Non-parametric 1d distribution -- derives cdf based on stored values.

    Introduced to complement parametric distributions present in scipy.stats.
    """

    def __init__(self, dist_samples):
        # Keep a flattened copy of every observed sample; the empirical
        # CDF is computed from them on demand.
        self._dist_samples = N.ravel(dist_samples)


    @staticmethod
    def fit(dist_samples):
        """Return the 'parameters' of this distribution -- the samples
        themselves, in a list suitable for passing to the constructor.
        """
        return [dist_samples]


    def cdf(self, x):
        """Returns the cdf value at `x`.

        The value is the fraction of stored samples which are <= the
        corresponding element of `x` (empirical CDF), evaluated
        elementwise.
        """
        samples = self._dist_samples

        def _frac_not_above(v):
            # proportion of stored samples not exceeding v
            return (samples <= v).mean()

        return N.vectorize(_frac_not_above)(x)
41 42
def _pvalue(x, cdf_func, tail, return_tails=False, name=None):
    """Helper function to return p-value(x) given cdf and tail

    :Parameters:
      cdf_func : callable
        Function to be used to derive cdf values for x
      tail : str ('left', 'right', 'any', 'both')
        Which tail of the distribution to report. For 'any' and 'both'
        it chooses the tail it belongs to based on the comparison to
        p=0.5. In the case of 'any' significance is taken like in a
        one-tailed test.
      return_tails : bool
        If True, a tuple return (pvalues, tails), where tails contain
        1s if value was from the right tail, and 0 if the value was
        from the left tail.
    """
    scalar_input = N.isscalar(x)
    if scalar_input:
        x = [x]

    cdf = cdf_func(x)

    if __debug__ and 'CHECK_STABILITY' in debug.active:
        cdf_min, cdf_max = N.min(cdf), N.max(cdf)
        if cdf_min < 0 or cdf_max > 1.0:
            s = ('', ' for %s' % name)[int(name is not None)]
            warning('Stability check of cdf %s failed%s. Min=%s, max=%s' % \
                    (cdf_func, s, cdf_min, cdf_max))

    # Some scipy distributions are numerically unstable and may escape
    # the [0, 1] range -- there is no way around forcing them back in
    cdf = N.clip(cdf, 0, 1.0)

    if tail == 'left':
        if return_tails:
            right_tail = N.zeros(cdf.shape, dtype=bool)
    elif tail == 'right':
        cdf = 1 - cdf
        if return_tails:
            right_tail = N.ones(cdf.shape, dtype=bool)
    elif tail in ('any', 'both'):
        # decide per element which tail it belongs to
        right_tail = (cdf >= 0.5)
        cdf[right_tail] = 1.0 - cdf[right_tail]
        if tail == 'both':
            # two-tailed test -- significance has to be halved,
            # i.e. the p-value doubled
            cdf *= 2

    # NaN inputs must never be reported as significant
    cdf[N.isnan(x)] = 1.0
    if scalar_input:
        res = cdf[0]
    else:
        res = cdf

    if return_tails:
        return (res, right_tail)
    return res
99 100
class NullDist(Stateful):
    """Base class for null-hypothesis testing.

    """

    # The base class itself gains nothing from states, but derived
    # classes (e.g. MCNullDist) do. To avoid multiple inheritance and
    # the headache it brings, all of them are Stateful -- the
    # performance hit should be negligible in most scenarios
    _ATTRIBUTE_COLLECTIONS = ['states']

    def __init__(self, tail='both', **kwargs):
        """Cheap initialization.

        :Parameter:
          tail: str ('left', 'right', 'any', 'both')
            Which tail of the distribution to report. For 'any' and 'both'
            it chooses the tail it belongs to based on the comparison to
            p=0.5. In the case of 'any' significance is taken like in a
            one-tailed test.
        """
        Stateful.__init__(self, **kwargs)
        self._setTail(tail)


    def _setTail(self, tail):
        # validate before assigning
        if tail not in ('left', 'right', 'any', 'both'):
            raise ValueError('Unknown value "%s" to `tail` argument.'
                             % tail)
        self.__tail = tail


    def fit(self, measure, wdata, vdata=None):
        """Implement to fit the distribution to the data."""
        raise NotImplementedError


    def cdf(self, x):
        """Implementations return the value of the cumulative distribution
        function (left or right tail dpending on the setting).
        """
        raise NotImplementedError


    def p(self, x, **kwargs):
        """Returns the p-value for values of `x`.
        Returned values are determined left, right, or from any tail
        depending on the constructor setting.

        In case a `FeaturewiseDatasetMeasure` was used to estimate the
        distribution the method returns an array. In that case `x` can be
        a scalar value or an array of a matching shape.
        """
        return _pvalue(x, self.cdf, self.__tail, **kwargs)


    tail = property(fget=lambda self: self.__tail, fset=_setTail)
160 161
class MCNullDist(NullDist):
    """Null-hypothesis distribution is estimated from randomly permuted data labels.

    The distribution is estimated by calling fit() with an appropriate
    `DatasetMeasure` or `TransferError` instance and a training and a
    validation dataset (in case of a `TransferError`). For a customizable
    amount of cycles the training data labels are permuted and the
    corresponding measure computed. In case of a `TransferError` this is the
    error when predicting the *correct* labels of the validation dataset.

    The distribution can be queried using the `cdf()` method, which can be
    configured to report probabilities/frequencies from `left` or `right` tail,
    i.e. fraction of the distribution that is lower or larger than some
    critical value.

    This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
    returns an array of featurewise probabilities/frequencies.
    """

    _DEV_DOC = """
    TODO automagically decide on the number of samples/permutations needed
    Caution should be paid though since resultant distributions might be
    quite far from some conventional ones (e.g. Normal) -- it is expected to
    them to be bimodal (or actually multimodal) in many scenarios.
    """

    dist_samples = StateVariable(enabled=False,
                                 doc='Samples obtained for each permutation')

    def __init__(self, dist_class=Nonparametric, permutations=100, **kwargs):
        """Initialize Monte-Carlo Permutation Null-hypothesis testing

        :Parameter:
          dist_class: class
            This can be any class which provides parameters estimate
            using `fit()` method to initialize the instance, and
            provides `cdf(x)` method for estimating value of x in CDF.
            All distributions from SciPy's 'stats' module can be used.
          permutations: int
            This many permutations of label will be performed to
            determine the distribution under the null hypothesis.
        """
        NullDist.__init__(self, **kwargs)

        self._dist_class = dist_class
        # actual fitted distributions, one per element; empty until
        # fit() has been called
        self._dist = []

        # number of label permutations used to estimate the null
        # distribution
        self.__permutations = permutations


    def fit(self, measure, wdata, vdata=None):
        """Fit the distribution by performing multiple cycles which repeatedly
        permuted labels in the training dataset.

        :Parameter:
          measure: (`Featurewise`)`DatasetMeasure` | `TransferError`
            TransferError instance used to compute all errors.
          wdata: `Dataset` which gets permuted and used to compute the
            measure/transfer error multiple times.
          vdata: `Dataset` used for validation.
            If provided measure is assumed to be a `TransferError` and
            working and validation dataset are passed onto it.
        """
        # holds the measure values obtained under randomized labels
        dist_samples = []

        # decide on the arguments to measure
        if vdata is not None:
            measure_args = [vdata, wdata]
        else:
            measure_args = [wdata]

        # estimate null-distribution
        for p in xrange(self.__permutations):
            # new permutation all the time
            # but only permute the training data and keep the testdata constant
            #
            # TODO this really needs to be more clever! If data samples are
            #      shuffled within a class it really makes no difference for the
            #      classifier, hence the number of permutations to estimate the
            #      null-distribution of transfer errors can be reduced dramatically
            #      when the *right* permutations (the ones that matter) are done.
            wdata.permuteLabels(True, perchunk=False)

            # compute and store the measure of this permutation
            # assume it has `TransferError` interface
            dist_samples.append(measure(*measure_args))

            # restore original labels
            wdata.permuteLabels(False, perchunk=False)

        # store samples (also feeds the `dist_samples` state variable)
        self.dist_samples = dist_samples = N.asarray(dist_samples)

        # fit distribution per each element

        # to decide either it was done on scalars or vectors
        shape = dist_samples.shape
        nshape = len(shape)
        # if just 1 dim, original data was scalar, just create an
        # artificial dimension for it
        if nshape == 1:
            dist_samples = dist_samples[:, N.newaxis]

        # fit per each element.
        # XXX could be more elegant? may be use N.vectorize?
        dist_samples_rs = dist_samples.reshape((shape[0], -1))
        dist = []
        for samples in dist_samples_rs.T:
            params = self._dist_class.fit(samples)
            if __debug__ and 'STAT' in debug.active:
                debug('STAT', 'Estimated parameters for the %s are %s'
                      % (self._dist_class, str(params)))
            dist.append(self._dist_class(*params))
        self._dist = dist


    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.
        """
        # BUG FIX: `_dist` is initialized (and reset by clean()) to an
        # empty list, never to None, so the former `is None` guard could
        # never fire and querying an unfit distribution fell through to a
        # confusing "fit for 0 elements" ValueError. Check emptiness too.
        if self._dist is None or len(self._dist) == 0:
            # XXX We might not want to descriminate that way since
            # usually generators also have .cdf where they rely on the
            # default parameters. But then what about Nonparametric
            raise RuntimeError("Distribution has to be fit first")

        is_scalar = N.isscalar(x)
        if is_scalar:
            x = [x]

        x = N.asanyarray(x)
        xshape = x.shape
        # assure x is a 1D array now
        x = x.reshape((-1,))

        if len(self._dist) != len(x):
            raise ValueError('Distribution was fit for structure with %d'
                             ' elements, whenever now queried with %d elements'
                             % (len(self._dist), len(x)))

        # extract cdf values per each element
        cdfs = [dist.cdf(v) for v, dist in zip(x, self._dist)]
        return N.array(cdfs).reshape(xshape)


    def clean(self):
        """Clean stored distributions

        Storing all of the distributions might be too expensive
        (e.g. in case of Nonparametric), and the scope of the object
        might be too broad to wait for it to be destroyed. Clean would
        bind dist_samples to empty list to let gc revoke the memory.
        """
        self._dist = []
319 320 321
class FixedNullDist(NullDist):
    """Proxy/Adaptor class for SciPy distributions.

    All distributions from SciPy's 'stats' module can be used with this class.

    >>> import numpy as N
    >>> from scipy import stats
    >>> from mvpa.clfs.stats import FixedNullDist
    >>>
    >>> dist = FixedNullDist(stats.norm(loc=2, scale=4))
    >>> dist.p(2)
    0.5
    >>>
    >>> dist.cdf(N.arange(5))
    array([ 0.30853754,  0.40129367,  0.5       ,  0.59870633,  0.69146246])
    >>>
    >>> dist = FixedNullDist(stats.norm(loc=2, scale=4), tail='right')
    >>> dist.p(N.arange(5))
    array([ 0.69146246,  0.59870633,  0.5       ,  0.40129367,  0.30853754])
    """
    def __init__(self, dist, **kwargs):
        """
        :Parameter:
          dist: distribution object
            This can be any object the has a `cdf()` method to report the
            cumulative distribition function values.
        """
        NullDist.__init__(self, **kwargs)
        # simply remember the wrapped (frozen) distribution
        self._dist = dist


    def fit(self, measure, wdata, vdata=None):
        """Does nothing since the distribution is already fixed."""
        pass


    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`."""
        # delegate straight to the wrapped distribution
        dist = self._dist
        return dist.cdf(x)
363 364
class AdaptiveNullDist(FixedNullDist):
    """Adaptive distribution which adjusts parameters according to the data

    WiP: internal implementation might change
    """
    def fit(self, measure, wdata, vdata=None):
        """Cares about dimensionality of the feature space in measure
        """

        # Determine the number of features the measure operates on -- it
        # parametrizes the adapted distribution (see _adapt below)
        try:
            nfeatures = len(measure.feature_ids)
        except ValueError: # XXX
            # NOTE(review): only ValueError is caught here; if `measure`
            # lacked `feature_ids` entirely an AttributeError would
            # propagate -- confirm which exception the state access
            # actually raises
            nfeatures = N.prod(wdata.shape[1:])

        dist_gen = self._dist
        if not hasattr(dist_gen, 'fit'): # frozen already
            dist_gen = dist_gen.dist # rv_frozen at least has it ;)

        # subclasses compute the adapted (args, kwargs) for the generator
        args, kwargs = self._adapt(nfeatures, measure, wdata, vdata)
        if __debug__:
            debug('STAT', 'Adapted parameters for %s to be %s, %s'
                  % (dist_gen, args, kwargs))
        # re-freeze the generator with the adapted parameters and store it
        # in place of the previously stored distribution
        self._dist = dist_gen(*args, **kwargs)


    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        # Abstract hook: subclasses return (args, kwargs) used to
        # (re)instantiate the distribution generator
        raise NotImplementedError
392 393
class AdaptiveRDist(AdaptiveNullDist):
    """Adaptive rdist: params are (nfeatures-1, 0, 1)
    """

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        # rdist's shape parameter is tied to the dimensionality of the
        # feature space; loc/scale stay at 0/1
        return (nfeatures-1, 0, 1), {}

    # XXX: RDist has stability issue, just run
    #  python -c "import scipy.stats; print scipy.stats.rdist(541,0,1).cdf(0.72)"
    # to get some improbable value, so we need to take care about that manually
    # here
    def cdf(self, x):
        """CDF with a manual workaround for scipy's rdist instability."""
        values = self._dist.cdf(x)
        unstable = N.where(N.abs(values) > 1)
        # N.clip would be faster/more elegant, but the instability may
        # flip the sign of the result arbitrarily, so the side of the
        # distribution has to be decided from x itself
        if len(unstable[0]):
            # the distribution has its mean at 0, so the correct limiting
            # value is fully determined by which side of the mean x is on
            patched = values[unstable]
            x_unstable = x[unstable]
            patched[x_unstable < 0] = 0.0
            patched[x_unstable >= 0] = 1.0
            values[unstable] = patched
        return values
420 421
class AdaptiveNormal(AdaptiveNullDist):
    """Adaptive Normal Distribution: params are (0, sqrt(1/nfeatures))
    """

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        # zero mean; standard deviation shrinks with the dimensionality
        # of the feature space
        loc = 0
        scale = 1.0 / N.sqrt(nfeatures)
        return (loc, scale), {}
428 429 430 if externals.exists('scipy'): 431 from mvpa.support.stats import scipy 432 from scipy.stats import kstest 433 """ 434 Thoughts: 435 436 So we can use `scipy.stats.kstest` (Kolmogorov-Smirnov test) to 437 check/reject H0 that samples come from a given distribution. But 438 since it is based on a full range of data, we might better of with 439 some ad-hoc checking by the detection power of the values in the 440 tail of a tentative distribution. 441 442 """ 443 444 # We need a way to fixate estimation of some parameters 445 # (e.g. mean) so lets create a simple proxy, or may be class 446 # generator later on, which would take care about punishing change 447 # from the 'right' arguments 448 449 import scipy 450
    class rv_semifrozen(object):
        """Helper proxy-class to fit distribution when some parameters are known

        It is an ugly hack with snippets of code taken from scipy, which is
        Copyright (c) 2001, 2002 Enthought, Inc.
        and is distributed under BSD license
        http://www.scipy.org/License_Compatibility

        The proxy wraps a scipy distribution generator and overrides only
        `fit` and `nnlf`, so that parameters given at construction time
        (loc, scale, or any of the shape args) stay fixed during fitting
        while the remaining ones are optimized. All other attribute
        access is forwarded to the wrapped distribution (see
        __getattribute__ below).
        """

        def __init__(self, dist, loc=None, scale=None, args=None):
            # dist: the scipy distribution generator to wrap.
            # loc/scale/args: give a value to fix a parameter, or None to
            # leave it free for the optimizer.
            self._dist = dist
            # loc and scale
            theta = (loc, scale)
            # args
            Narg_ = dist.numargs
            if args is not None:
                Narg = len(args)
                if Narg > Narg_:
                    raise ValueError, \
                          'Distribution %s has only %d arguments. Got %d' \
                          % (dist, Narg_, Narg)
                # pad unspecified trailing shape args with None (== free)
                args += (None,) * (Narg_ - Narg)
            else:
                args = (None,) * Narg_

            # indices of the *free* (to be optimized) shape arguments
            args_i = [i for i,v in enumerate(args) if v is None]
            self._fargs = (list(args+theta), args_i)
            """Arguments which should get some fixed value"""


        def nnlf(self, theta, x):
            # Negative log-likelihood: - sum (log pdf(x, theta),axis=0)
            # where theta are the parameters (including loc and scale).
            # Fixed parameters are taken from self._fargs; free ones are
            # consumed from the *end* of theta (scale, then loc), walking
            # backwards with index i.
            fargs, fargs_i = self._fargs
            try:
                i=-1
                if fargs[-1] is not None:
                    scale = fargs[-1]
                else:
                    scale = theta[i]
                    i -= 1

                if fargs[-2] is not None:
                    loc = fargs[-2]
                else:
                    loc = theta[i]
                    i -= 1

                args = theta[:i+1]
                # adjust args if there were fixed: overlay the free values
                # from theta onto their slots among the fixed ones
                for i,a in zip(fargs_i, args):
                    fargs[i] = a
                args = fargs[:-2]

            except IndexError:
                raise ValueError, "Not enough input arguments."
            # _argcheck/a/b/_nnlf resolve on the wrapped distribution via
            # __getattribute__ forwarding
            if not self._argcheck(*args) or scale <= 0:
                return N.inf
            x = N.asarray((x-loc) / scale)
            cond0 = (x <= self.a) | (x >= self.b)
            if (N.any(cond0)):
                return N.inf
            else:
                return self._nnlf(x, *args) + len(x)*N.log(scale)

        def fit(self, data, *args, **kwds):
            """Fit only the free parameters, returning the full
            (shape args + loc + scale) vector like scipy's fit does."""
            # map(f, list1, list2) pairs the keys with their defaults:
            # loc0 = kwds.get('loc', 0.0), scale0 = kwds.get('scale', 1.0)
            loc0, scale0 = map(kwds.get, ['loc', 'scale'], [0.0, 1.0])
            fargs, fargs_i = self._fargs
            Narg = len(args)
            Narg_ = self.numargs
            if Narg != Narg_:
                if Narg > Narg_:
                    raise ValueError, "Too many input arguments."
                else:
                    args += (1.0,)*(self.numargs-Narg)

            # Provide only those args which are not fixed, and
            # append location and scale (if not fixed) at the end
            if len(fargs_i) != Narg_:
                x0 = tuple([args[i] for i in fargs_i])
            else:
                x0 = args
            if fargs[-2] is None: x0 = x0 + (loc0,)
            if fargs[-1] is None: x0 = x0 + (scale0,)

            opt_x = scipy.optimize.fmin(self.nnlf, x0, args=(N.ravel(data),), disp=0)

            # reconstruct back: free loc/scale sit at the tail of opt_x
            i = 0
            loc, scale = fargs[-2:]
            if fargs[-1] is None:
                i -= 1
                scale = opt_x[i]
            if fargs[-2] is None:
                i -= 1
                loc = opt_x[i]

            # assign those which weren't fixed
            # NOTE(review): this indexes opt_x by the original argument
            # position rather than by position among the free ones --
            # looks suspicious when fixed and free shape args interleave;
            # confirm against scipy's fit semantics
            for i in fargs_i:
                fargs[i] = opt_x[i]

            #raise ValueError
            opt_x = N.hstack((fargs[:-2], (loc, scale)))
            return opt_x


        def __setattr__(self, a, v):
            # own bookkeeping attributes stay on the proxy; everything
            # else is stored on the wrapped distribution
            if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
                self._dist.__setattr__(a, v)
            else:
                object.__setattr__(self, a, v)


        def __getattribute__(self, a):
            """We need to redirect all queries correspondingly

            Only the proxy's own members are served locally; any other
            attribute (pdf, cdf, numargs, a, b, ...) comes from the
            wrapped distribution.
            """
            if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
                return getattr(self._dist, a)
            else:
                return object.__getattribute__(self, a)
573 574 575
576 - def matchDistribution(data, nsamples=None, loc=None, scale=None, 577 args=None, test='kstest', distributions=None, 578 **kwargs):
579 """Determine best matching distribution. 580 581 Can be used for 'smelling' the data, as well to choose a 582 parametric distribution for data obtained from non-parametric 583 testing (e.g. `MCNullDist`). 584 585 WiP: use with caution, API might change 586 587 :Parameters: 588 data : N.ndarray 589 Array of the data for which to deduce the distribution. It has 590 to be sufficiently large to make a reliable conclusion 591 nsamples : int or None 592 If None -- use all samples in data to estimate parametric 593 distribution. Otherwise use only specified number randomly selected 594 from data. 595 loc : float or None 596 Loc for the distribution (if known) 597 scale : float or None 598 Scale for the distribution (if known) 599 test : basestring 600 What kind of testing to do. Choices: 601 'p-roc' : detection power for a given ROC. Needs two 602 parameters: `p=0.05` and `tail='both'` 603 'kstest' : 'full-body' distribution comparison. The best 604 choice is made by minimal reported distance after estimating 605 parameters of the distribution. Parameter `p=0.05` sets 606 threshold to reject null-hypothesis that distribution is the 607 same. 608 WARNING: older versions (e.g. 0.5.2 in etch) of scipy have 609 incorrect kstest implementation and do not function 610 properly 611 distributions : None or list of basestring or tuple(basestring, dict) 612 Distributions to check. If None, all known in scipy.stats 613 are tested. If distribution is specified as a tuple, then 614 it must contain name and additional parameters (name, loc, 615 scale, args) in the dictionary. Entry 'scipy' adds all known 616 in scipy.stats. 
617 **kwargs 618 Additional arguments which are needed for each particular test 619 (see above) 620 621 :Example: 622 data = N.random.normal(size=(1000,1)); 623 matches = matchDistribution( 624 data, 625 distributions=['rdist', 626 ('rdist', {'name':'rdist_fixed', 627 'loc': 0.0, 628 'args': (10,)})], 629 nsamples=30, test='p-roc', p=0.05) 630 """ 631 632 # Handle parameters 633 _KNOWN_TESTS = ['p-roc', 'kstest'] 634 if not test in _KNOWN_TESTS: 635 raise ValueError, 'Unknown kind of test %s. Known are %s' \ 636 % (test, _KNOWN_TESTS) 637 638 data = N.ravel(data) 639 # data sampled 640 if nsamples is not None: 641 if __debug__: 642 debug('STAT', 'Sampling %d samples from data for the ' \ 643 'estimation of the distributions parameters' % nsamples) 644 indexes_selected = (N.random.sample(nsamples)*len(data)).astype(int) 645 data_selected = data[indexes_selected] 646 else: 647 indexes_selected = N.arange(len(data)) 648 data_selected = data 649 650 p_thr = kwargs.get('p', 0.05) 651 if test == 'p-roc': 652 tail = kwargs.get('tail', 'both') 653 data_p = _pvalue(data, Nonparametric(data).cdf, tail) 654 data_p_thr = N.abs(data_p) <= p_thr 655 true_positives = N.sum(data_p_thr) 656 if true_positives == 0: 657 raise ValueError, "Provided data has no elements in non-" \ 658 "parametric distribution under p<=%.3f. 
Please " \ 659 "increase the size of data or value of p" % p 660 if __debug__: 661 debug('STAT_', 'Number of positives in non-parametric ' 662 'distribution is %d' % true_positives) 663 664 if distributions is None: 665 distributions = ['scipy'] 666 667 # lets see if 'scipy' entry was in there 668 try: 669 scipy_ind = distributions.index('scipy') 670 distributions.pop(scipy_ind) 671 distributions += scipy.stats.distributions.__all__ 672 except ValueError: 673 pass 674 675 results = [] 676 for d in distributions: 677 dist_gen, loc_, scale_, args_ = None, loc, scale, args 678 if isinstance(d, basestring): 679 dist_gen = d 680 dist_name = d 681 elif isinstance(d, tuple): 682 if not (len(d)==2 and isinstance(d[1], dict)): 683 raise ValueError,\ 684 "Tuple specification of distribution must be " \ 685 "(d, {params}). Got %s" % (d,) 686 dist_gen = d[0] 687 loc_ = d[1].get('loc', loc) 688 scale_ = d[1].get('scale', scale) 689 args_ = d[1].get('args', args) 690 dist_name = d[1].get('name', str(dist_gen)) 691 else: 692 dist_gen = d 693 dist_name = str(d) 694 695 # perform actions which might puke for some distributions 696 try: 697 dist_gen_ = getattr(scipy.stats, dist_gen) 698 # specify distribution 'optimizer'. 
If loc or scale was provided, 699 # use home-brewed rv_semifrozen 700 if args_ is not None or loc_ is not None or scale_ is not None: 701 dist_opt = rv_semifrozen(dist_gen_, loc=loc_, scale=scale_, args=args_) 702 else: 703 dist_opt = dist_gen_ 704 dist_params = dist_opt.fit(data_selected) 705 if __debug__: 706 debug('STAT__', 707 'Got distribution parameters %s for %s' 708 % (dist_params, dist_name)) 709 if test == 'p-roc': 710 cdf_func = lambda x: dist_gen_.cdf(x, *dist_params) 711 # We need to compare detection under given p 712 cdf_p = N.abs(_pvalue(data, cdf_func, tail, name=dist_gen)) 713 cdf_p_thr = cdf_p <= p_thr 714 D, p = N.sum(N.abs(data_p_thr - cdf_p_thr))*1.0/true_positives, 1 715 if __debug__: res_sum = 'D=%.2f' % D 716 elif test == 'kstest': 717 D, p = kstest(data, d, args=dist_params) 718 if __debug__: res_sum = 'D=%.3f p=%.3f' % (D, p) 719 except (TypeError, ValueError, AttributeError, 720 NotImplementedError), e:#Exception, e: 721 if __debug__: 722 debug('STAT__', 723 'Testing for %s distribution failed due to %s' 724 % (d, str(e))) 725 continue 726 727 if p > p_thr and not N.isnan(D): 728 results += [ (D, dist_gen, dist_name, dist_params) ] 729 if __debug__: 730 debug('STAT_', 'Testing for %s dist.: %s' % (dist_name, res_sum)) 731 else: 732 if __debug__: 733 debug('STAT__', 'Cannot consider %s dist. with %s' 734 % (d, res_sum)) 735 continue 736 737 # sort in ascending order, so smaller is better 738 results.sort() 739 740 if __debug__ and 'STAT' in debug.active: 741 # find the best and report it 742 nresults = len(results) 743 sresult = lambda r:'%s(%s)=%.2f' % (r[1], ', '.join(map(str, r[3])), r[0]) 744 if nresults>0: 745 nnextbest = min(2, nresults-1) 746 snextbest = ', '.join(map(sresult, results[1:1+nnextbest])) 747 debug('STAT', 'Best distribution %s. Next best: %s' 748 % (sresult(results[0]), snextbest)) 749 else: 750 debug('STAT', 'Could not find suitable distribution') 751 752 # return all the results 753 return results
754 755 756 if externals.exists('pylab'): 757 import pylab as P 758
    def plotDistributionMatches(data, matches, nbins=31, nbest=5,
                                expand_tails=8, legend=2, plot_cdf=True,
                                p=None, tail='both'):
        """Plot best matching distributions

        :Parameters:
          data : N.ndarray
            Data which was used to obtain the matches
          matches : list of tuples
            Sorted matches as provided by matchDistribution
          nbins : int
            Number of bins in the histogram
          nbest : int
            Number of top matches to plot
          expand_tails : int
            How many bins away to add to parametrized distributions
            plots
          legend : int
            Either to provide legend and statistics in the legend.
            1 -- just lists distributions.
            2 -- adds distance measure
            3 -- tp/fp/fn in the case if p is provided
          plot_cdf : bool
            Either to plot cdf for data using non-parametric distribution
          p : float or None
            If not None, visualize null-hypothesis testing (given p).
            Bars in the histogram which fall under given p are colored
            in red. False positives and false negatives are marked as
            triangle up and down symbols correspondingly
          tail : ('left', 'right', 'any', 'both')
            If p is not None, the choise of tail for null-hypothesis
            testing

        :Returns: tuple(histogram, list of lines)
        """

        # P.hist returns (counts, bin_centers, patches)
        hist = P.hist(data, nbins, normed=1, align='center')
        # NOTE(review): data_range is computed but never used below
        data_range = [N.min(data), N.max(data)]

        # x's: bin centers, extended by expand_tails bins on both sides so
        # the parametric pdfs/cdfs are drawn beyond the data range
        x = hist[1]
        dx = x[expand_tails] - x[0] # how much to expand tails by
        x = N.hstack((x[:expand_tails] - dx, x, x[-expand_tails:] + dx))

        nonparam = Nonparametric(data)
        # plot cdf
        if plot_cdf:
            P.plot(x, nonparam.cdf(x), 'k--', linewidth=1)

        p_thr = p

        # NOTE(review): when p is None these `<= p_thr` comparisons rely
        # on Python 2 None-comparison semantics; under Python 3 they would
        # raise TypeError -- confirm intended usage requires p
        data_p = _pvalue(data, nonparam.cdf, tail)
        data_p_thr = (data_p <= p_thr).ravel()

        x_p = _pvalue(x, Nonparametric(data).cdf, tail)
        x_p_thr = N.abs(x_p) <= p_thr
        # color bars which pass thresholding in red
        for thr, bar in zip(x_p_thr[expand_tails:], hist[2]):
            bar.set_facecolor(('w','r')[int(thr)])

        if not len(matches):
            # no matches were provided
            warning("No matching distributions were provided -- nothing to plot")
            return (hist, )

        lines = []
        labels = []
        # plot only the nbest top matches
        for i in xrange(min(nbest, len(matches))):
            D, dist_gen, dist_name, params = matches[i]
            # freeze the scipy distribution with the fitted parameters
            dist = getattr(scipy.stats, dist_gen)(*params)

            label = '%s' % (dist_name)
            if legend > 1: label += '(D=%.2f)' % (D)

            xcdf_p = N.abs(_pvalue(x, dist.cdf, tail))
            xcdf_p_thr = (xcdf_p <= p_thr).ravel()

            if p is not None and legend > 2:
                # We need to compare detection under given p
                data_cdf_p = N.abs(_pvalue(data, dist.cdf, tail))
                data_cdf_p_thr = (data_cdf_p <= p_thr).ravel()

                # true positives
                tp = N.logical_and(data_cdf_p_thr, data_p_thr)
                # false positives
                fp = N.logical_and(data_cdf_p_thr, ~data_p_thr)
                # false negatives
                fn = N.logical_and(~data_cdf_p_thr, data_p_thr)

                label += ' tp/fp/fn=%d/%d/%d)' % \
                         tuple(map(N.sum, [tp,fp,fn]))

            pdf = dist.pdf(x)
            line = P.plot(x, pdf, '-', linewidth=2, label=label)
            color = line[0].get_color()

            if plot_cdf:
                cdf = dist.cdf(x)
                P.plot(x, cdf, ':', linewidth=1, color=color, label=label)

            # TODO: decide on tp/fp/fn by not centers of the bins but
            #       by the values in data in the ranges covered by
            #       those bins. Then it would correspond to the values
            #       mentioned in the legend
            if p is not None:
                # true positives
                xtp = N.logical_and(xcdf_p_thr, x_p_thr)
                # false positives
                xfp = N.logical_and(xcdf_p_thr, ~x_p_thr)
                # false negatives
                xfn = N.logical_and(~xcdf_p_thr, x_p_thr)

                # no need to plot tp explicitely -- marked by color of the bar
                # P.plot(x[xtp], pdf[xtp], 'o', color=color)
                P.plot(x[xfp], pdf[xfp], '^', color=color)
                P.plot(x[xfn], pdf[xfn], 'v', color=color)

            lines.append(line)
            labels.append(label)

        if legend:
            P.legend(lines, labels)

        return (hist, lines)
883 884 #if True: 885 # data = N.random.normal(size=(1000,1)); 886 # matches = matchDistribution( 887 # data, 888 # distributions=['scipy', 889 # ('norm', {'name':'norm_known', 890 # 'scale': 1.0, 891 # 'loc': 0.0})], 892 # nsamples=30, test='p-roc', p=0.05) 893 # P.figure(); plotDistributionMatches(data, matches, nbins=101, 894 # p=0.05, legend=4, nbest=5) 895 896
def autoNullDist(dist):
    """Cheater for human beings -- wraps `dist` if needed with some
    NullDist

    tail and other arguments are assumed to be default as in
    NullDist/MCNullDist
    """
    # already usable as-is: nothing to wrap
    if dist is None or isinstance(dist, NullDist):
        return dist
    # anything exposing fit() is treated as a distribution class/generator
    # suitable for Monte-Carlo estimation
    if hasattr(dist, 'fit'):
        if __debug__:
            debug('STAT', 'Wrapping %s into MCNullDist' % dist)
        return MCNullDist(dist)
    # otherwise assume a frozen distribution with a cdf() method
    if __debug__:
        debug('STAT', 'Wrapping %s into FixedNullDist' % dist)
    return FixedNullDist(dist)
914 915 916 # if no scipy, we need nanmean
917 -def _chk_asarray(a, axis):
918 if axis is None: 919 a = N.ravel(a) 920 outaxis = 0 921 else: 922 a = N.asarray(a) 923 outaxis = axis 924 return a, outaxis
925
def nanmean(x, axis=0):
    """Compute the mean over the given axis ignoring nans.

    :Parameters:
      x : ndarray
        input array
      axis : int
        axis along which the mean is computed.

    :Results:
      m : float
        the mean."""
    # normalize the input (inlined _chk_asarray): flatten for axis=None
    if axis is None:
        x = N.ravel(x)
        axis = 0
    else:
        x = N.asarray(x)
    # work on a copy -- NaN entries get zeroed out below
    x = x.copy()
    nans = N.isnan(x)
    n_along = x.shape[axis]
    # fraction of finite entries along the axis; dividing the plain mean
    # by it is equivalent to averaging over the finite entries only
    factor = 1.0 - N.sum(nans, axis)*1.0/n_along
    x[nans] = 0
    return N.mean(x, axis)/factor
945