Package mvpa :: Package misc :: Module transformers
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.misc.transformers

  1  #emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  #ex: set sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Simply functors that transform something.""" 
 10   
 11  _DEV_DOC = """ 
 12  Follow the convetion that functions start with lower case, and classes 
 13  with uppercase letter. 
 14  """ 
 15   
 16  __docformat__ = 'restructuredtext' 
 17   
 18   
 19  import numpy as N 
 20   
 21  from mvpa.base import externals, warning 
 22  from mvpa.misc.state import StateVariable, Stateful 
 23   
 24  if __debug__: 
 25      from mvpa.base import debug 
 26   
 27   
def Absolute(x):
    """Return the elementwise absolute value of the argument."""
    return N.abs(x)
31 32
def OneMinus(x):
    """Return elementwise ``1 - x`` of the argument."""
    result = 1 - x
    return result
36 37
def Identity(x):
    """Return the argument unchanged."""
    return x
41 42
def FirstAxisMean(x):
    """Compute the mean along the first (0th) axis."""
    axis0_mean = N.mean(x, axis=0)
    return axis0_mean
46
def FirstAxisSumNotZero(x):
    """Count, along the first axis, how many values are not equal to zero."""
    nonzero_mask = N.asarray(x) != 0
    return nonzero_mask.sum(axis=0)
51 52
def SecondAxisMean(x):
    """Mean across the 2nd axis.

    Use cases:
     - to combine multiple sensitivities to get a sense of the mean
       sensitivity across splits
    """
    return N.mean(x, axis=1)
61 62
def SecondAxisSumOfAbs(x):
    """Sum of absolute values along the 2nd axis.

    Use cases:
     - to combine multiple sensitivities to get a sense of which
       features are most influential
    """
    return N.sum(N.absolute(x), axis=1)
71 72
def SecondAxisMaxOfAbs(x):
    """Maximum of absolute values along the 2nd axis."""
    return N.max(N.absolute(x), axis=1)
77 78
def GrandMean(x):
    """Mean over all elements of the argument."""
    grand = N.mean(x)
    return grand
82 83
84 -def L2Normed(x, norm=1.0, reverse=False):
85 """Norm the values so that regular vector norm becomes `norm` 86 87 More verbose: Norm that the sum of the squared elements of the 88 returned vector becomes `norm`. 89 """ 90 xnorm = N.linalg.norm(x) 91 return x * (norm/xnorm)
92
93 -def L1Normed(x, norm=1.0, reverse=False):
94 """Norm the values so that L_1 norm (sum|x|) becomes `norm`""" 95 xnorm = N.sum(N.abs(x)) 96 return x * (norm/xnorm)
97 98
99 -def RankOrder(x, reverse=False):
100 """Rank-order by value. Highest gets 0""" 101 102 # XXX was Yarik on drugs? please simplify this beast 103 nelements = len(x) 104 indexes = N.arange(nelements) 105 t_indexes = indexes 106 if not reverse: 107 t_indexes = indexes[::-1] 108 tosort = zip(x, indexes) 109 tosort.sort() 110 ztosort = zip(tosort, t_indexes) 111 rankorder = N.empty(nelements, dtype=int) 112 rankorder[ [x[0][1] for x in ztosort] ] = \ 113 [x[1] for x in ztosort] 114 return rankorder
115 116
def ReverseRankOrder(x):
    """Convenience functor: rank-order with the lowest value getting 0."""
    return RankOrder(x, reverse=True)
120 121
122 -class OverAxis(object):
123 """Helper to apply transformer over specific axis 124 """ 125
126 - def __init__(self, transformer, axis=None):
127 """Initialize transformer wrapper with an axis. 128 129 :Parameters: 130 transformer 131 A callable to be used 132 axis : None or int 133 If None -- apply transformer across all the data. If some 134 int -- over that axis 135 """ 136 self.transformer = transformer 137 # sanity check 138 if not (axis is None or isinstance(axis, int)): 139 raise ValueError, "axis must be specified by integer" 140 self.axis = axis
141 142
143 - def __call__(self, x, *args, **kwargs):
144 transformer = self.transformer 145 axis = self.axis 146 if axis is None: 147 return transformer(x, *args, **kwargs) 148 149 x = N.asanyarray(x) 150 shape = x.shape 151 if axis >= len(shape): 152 raise ValueError, "Axis given in constructor %d is higher " \ 153 "than dimensionality of the data of shape %s" % (axis, shape) 154 155 # WRONG! ;-) 156 #for ind in xrange(shape[axis]): 157 # results.append(transformer(x.take([ind], axis=axis), 158 # *args, **kwargs)) 159 160 # TODO: more elegant/speedy solution? 161 shape_sweep = shape[:axis] + shape[axis+1:] 162 shrinker = None 163 """Either transformer reduces the dimensionality of the data""" 164 #results = N.empty(shape_out, dtype=x.dtype) 165 for index_sweep in N.ndindex(shape_sweep): 166 if axis > 0: 167 index = index_sweep[:axis] 168 else: 169 index = () 170 index = index + (slice(None),) + index_sweep[axis:] 171 x_sel = x[index] 172 x_t = transformer(x_sel, *args, **kwargs) 173 if shrinker is None: 174 if N.isscalar(x_t) or x_t.shape == shape_sweep: 175 results = N.empty(shape_sweep, dtype=x.dtype) 176 shrinker = True 177 elif x_t.shape == x_sel.shape: 178 results = N.empty(x.shape, dtype=x.dtype) 179 shrinker = False 180 else: 181 raise RuntimeError, 'Not handled by OverAxis kind of transformer' 182 183 if shrinker: 184 results[index_sweep] = x_t 185 else: 186 results[index] = x_t 187 188 return results
189 190
class DistPValue(Stateful):
    """Converts values into p-values under vague and non-scientific assumptions

    Each sample (row or column, depending on `sd`) is L2-normalized and
    its values converted to CDF values of an r-distribution; optionally
    (`fpp`) the tails are iteratively pruned first so that the empirical
    tail mass matches the requested false-positive proportion.
    """

    # number of features left in the estimated null-distribution per sample
    nulldist_number = StateVariable(enabled=True,
        doc="Number of features within the estimated null-distribution")

    # number of features pruned away as (tentative) positives per sample
    positives_recovered = StateVariable(enabled=True,
        doc="Number of features considered to be positives and which were recovered")


    def __init__(self, sd=0, distribution='rdist', fpp=None, nbins=400, **kwargs):
        """L2-Norm the values, convert them to p-values of a given distribution.

        :Parameters:
          sd : int
            Samples dimension (if len(x.shape)>1) on which to operate
          distribution : string
            Which distribution to use. Known are: 'rdist' (later normal should
            be there as well)
          fpp : float
            At what p-value (both tails) if not None, to control for false
            positives. It would iteratively prune the tails (tentative real positives)
            until empirical p-value becomes less or equal to numerical.
          nbins : int
            Number of bins for the iterative pruning of positives

        WARNING: Highly experimental/slow/etc: no theoretical grounds have been
        presented in any paper, nor proven
        """
        # hard dependency on scipy -- fail early if unavailable
        externals.exists('scipy', raiseException=True)
        Stateful.__init__(self, **kwargs)

        self.sd = sd
        if not (distribution in ['rdist']):
            raise ValueError, "Actually only rdist supported at the moment" \
                  " got %s" % distribution
        self.distribution = distribution
        self.fpp = fpp
        self.nbins = nbins


    def __call__(self, x):
        """Return p-values of `x`, same shape as the (2-D-ified) input."""
        from mvpa.support.stats import scipy
        import scipy.stats as stats

        # some local bindings
        distribution = self.distribution
        sd = self.sd
        fpp = self.fpp
        nbins = self.nbins

        x = N.asanyarray(x)
        shape_orig = x.shape
        ndims = len(shape_orig)

        # XXX May be just utilize OverAxis transformer?
        if ndims > 2:
            raise NotImplementedError, \
                  "TODO: add support for more than 2 dimensions"
        elif ndims == 1:
            # promote vector to a single-column matrix and operate along it
            x, sd = x[:, N.newaxis], 0

        # lets transpose for convenience, so samples are iterated as rows
        if sd == 0: x = x.T

        # Output p-values of x in null-distribution
        pvalues = N.zeros(x.shape)
        nulldist_number, positives_recovered = [], []

        # finally go through all data
        nd = x.shape[1]
        if __debug__:
            if nd < x.shape[0]:
                warning("Number of features in DistPValue lower than number of"
                        " items -- may be incorrect sd=%d was provided" % sd)
        dist = stats.rdist(nd-1, 0, 1)
        for i, xx in enumerate(x):
            # L2-normalize in place (mutates the row of the working copy)
            xx /= N.linalg.norm(xx)

            if fpp is not None:
                # Adaptive adjustment for false negatives:
                Nxx, xxx, pN_emp_prev = len(xx), xx, 1.0
                Nxxx = Nxx
                indexes = N.arange(Nxx)
                """What features belong to Null-distribution"""
                while True:
                    # NOTE(review): `normed` kwarg was removed in modern
                    # NumPy (use `density`) -- this code targets old NumPy.
                    Nhist = N.histogram(xxx, bins=nbins, normed=False)
                    pdf = Nhist[0].astype(float)/Nxxx
                    bins = Nhist[1]
                    # NOTE(review): bins[1] - bins[2] gives a NEGATIVE
                    # half-step; (bins[1] - bins[0])/2.0 was probably
                    # intended.  Currently harmless: the only uses below
                    # are commented out.
                    bins_halfstep = (bins[1] - bins[2])/2.0

                    # theoretical CDF
                    # was really unstable -- now got better ;)
                    dist_cdf = dist.cdf(bins)

                    # otherwise just recompute manually
                    # dist_pdf = dist.pdf(bins)
                    # dist_pdf /= N.sum(dist_pdf)

                    # XXX can't recall the function... silly
                    # probably could use N.integrate
                    # manual cumulative sum of the empirical pdf
                    cdf = N.zeros(nbins, dtype=float)
                    #dist_cdf = cdf.copy()
                    dist_prevv = cdf_prevv = 0.0
                    for j in range(nbins):
                        cdf_prevv = cdf[j] = cdf_prevv + pdf[j]
                        #dist_prevv = dist_cdf[j] = dist_prevv + dist_pdf[j]

                    # what bins fall into theoretical 'positives' in both tails
                    p = (0.5 - N.abs(dist_cdf - 0.5) < fpp/2.0)

                    # amount in empirical tails -- if we match theoretical, we
                    # should have total >= p

                    pN_emp = N.sum(pdf[p]) # / (1.0 * nbins)

                    if __debug__:
                        debug('TRAN_', "empirical p=%.3f for theoretical p=%.3f"
                              % (pN_emp, fpp))

                    if pN_emp <= fpp:
                        # we are done
                        break

                    if pN_emp > pN_emp_prev:
                        if __debug__:
                            debug('TRAN_', "Diverging -- thus keeping last result "
                                  "with p=%.3f" % pN_emp_prev)
                        # we better restore previous result
                        indexes, xxx, dist = indexes_prev, xxx_prev, dist_prev
                        break

                    pN_emp_prev = pN_emp
                    # very silly way for now -- just proceed by 1 bin
                    # (drop the outermost bin on each side per iteration)
                    keep = N.logical_and(xxx > bins[0], # + bins_halfstep,
                                         xxx < bins[-1]) # - bins_halfstep)
                    if __debug__:
                        debug('TRAN_', "Keeping %d out of %d elements" %
                              (N.sum(keep), Nxxx))

                    # Preserve them if we need to "roll back"
                    indexes_prev, xxx_prev, dist_prev = indexes, xxx, dist

                    # we should remove those which we assume to be positives and
                    # which should not belong to Null-dist
                    xxx, indexes = xxx[keep], indexes[keep]
                    # L2 renorm it
                    xxx = xxx / N.linalg.norm(xxx)
                    Nxxx = len(xxx)
                    dist = stats.rdist(Nxxx-1, 0, 1)

                Nindexes = len(indexes)
                Nrecovered = Nxx - Nindexes

                nulldist_number += [Nindexes]
                positives_recovered += [Nrecovered]

                if __debug__:
                    if distribution == 'rdist':
                        # NOTE(review): assert on a tuple is always true;
                        # `assert dist.args[0] == Nindexes-1` was probably
                        # intended.
                        assert(dist.args[0], Nindexes-1)
                    debug('TRAN', "Positives recovery finished with %d out of %d "
                          "entries in Null-distribution, thus %d positives "
                          "were recovered" % (Nindexes, Nxx, Nrecovered))

            # And now we need to perform our duty -- assign p-values
            #dist = stats.rdist(Nindexes-1, 0, 1)
            pvalues[i, :] = dist.cdf(xx)

        # XXX we might add an option to transform it to z-scores?
        result = pvalues

        # charge state variables
        # XXX might want to populate them for non-adaptive handling as well
        self.nulldist_number = nulldist_number
        self.positives_recovered = positives_recovered

        # transpose if needed -- undo the convenience transpose above
        if sd == 0:
            result = result.T

        return result
373