| Home | Trees | Indices | Help |
|
|---|
|
|
1 # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
2 # vi: set ft=python sts=4 ts=4 sw=4 et:
3 ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
4 #
5 # See COPYING file distributed along with the PyMVPA package for the
6 # copyright and license terms.
7 #
8 ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
9 """Little statistics helper"""
10
11 __docformat__ = 'restructuredtext'
12
13 from mvpa.base import externals
14
15 if externals.exists('scipy', raiseException=True):
16 import scipy.stats as stats
17
18 import numpy as N
19 import copy
20
def chisquare(obs, exp=None):
    """Compute the chisquare value of a contingency table with arbitrary
    dimensions.

    If no expected frequencies are supplied, the total N is assumed to be
    equally distributed across all cells.

    :Parameters:
      obs : array-like
        Observed frequencies (any number of dimensions).
      exp : array-like or None
        Expected frequencies, broadcastable against `obs`. If None, the
        total number of observations is spread evenly over all cells.

    Returns: chisquare-stats, associated p-value (upper tail)
    """
    obs = N.asarray(obs)

    # get total number of observations
    nobs = N.sum(obs)

    # if no expected values are supplied assume equal distribution.
    # Identity test on purpose: `exp == None` compares element-wise when
    # `exp` is an array and cannot be used as an `if` condition.
    if exp is None:
        exp = N.ones(obs.shape) * nobs / N.prod(obs.shape)

    # make sure to have floating point data (also accepts plain sequences)
    exp = N.asarray(exp, dtype=float)

    # compute chisquare value
    chisq = N.sum((obs - exp) ** 2 / exp)

    # return chisq and probability (upper tail); the survival function of
    # the chi2 distribution replaces the removed `stats.chisqprob`
    return chisq, stats.chi2.sf(chisq, N.prod(obs.shape) - 1)
47
48
class DSMatrix(object):
    """DSMatrix allows for the creation of dissimilarity matrices using
    arbitrary distance metrics.
    """

    # metric is a string
    def __init__(self, data_vectors, metric='spearman'):
        """Initialize DSMatrix

        :Parameters:
          data_vectors : ndarray
            m x n collection of vectors, where m is the number of exemplars
            and n is the number of features per exemplar
          metric : string
            Distance metric to use (e.g., 'euclidean', 'spearman', 'pearson',
            'confusion'). Any other value leaves the matrix filled with
            zeros.
        """
        # lazily computed representations (see getTriangle/getVectorForm)
        self.u_triangle = None
        self.vector_form = None

        # this one we know straight away, so set it
        self.metric = metric

        data_vectors = N.asarray(data_vectors)

        # size of dataset (checking if we're dealing with a column vector
        # only): if there are as many elements as exemplars, every exemplar
        # is a single value, so store it as a one-feature row and let all
        # metrics index rows the same way
        num_exem = N.shape(data_vectors)[0]
        if num_exem == N.size(data_vectors):
            data_vectors = data_vectors.reshape(num_exem, 1)

        pair_distance = {
            'euclidean': self._euclidean,
            'spearman': self._spearman,
            'pearson': self._pearson,
            'confusion': self._confusion,
        }.get(metric)

        # generate output (dissimilarity) matrix; `asmatrix` keeps the
        # matrix type of the result without relying on the `N.mat` alias
        dsmatrix = N.asmatrix(N.zeros((num_exem, num_exem)))

        if pair_distance is not None:
            # down rows
            for i in range(num_exem):
                # across columns
                for j in range(num_exem):
                    dsmatrix[i, j] = pair_distance(data_vectors[i],
                                                   data_vectors[j])

        self.full_matrix = dsmatrix

    @staticmethod
    def _euclidean(vec_a, vec_b):
        """Euclidean distance between two exemplars."""
        return N.linalg.norm(vec_a - vec_b)

    @staticmethod
    def _spearman(vec_a, vec_b):
        """One minus the Spearman rank correlation of two exemplars."""
        return 1 - stats.spearmanr(vec_a, vec_b)[0]

    @staticmethod
    def _pearson(vec_a, vec_b):
        """One minus the Pearson correlation of two exemplars."""
        return 1 - stats.pearsonr(vec_a, vec_b)[0]

    @staticmethod
    def _confusion(vec_a, vec_b):
        """0 if both exemplars agree on every feature, 1 otherwise."""
        return 1 - int(N.all(vec_a == vec_b))

    def getTriangle(self):
        """Return the upper triangle of the full matrix (lower part zeroed).

        The result is computed on first use and cached.
        """
        # if we need to create the u_triangle representation, do so.
        # `is None` on purpose: once the cache holds an array, `== None`
        # would be an element-wise comparison.
        if self.u_triangle is None:
            self.u_triangle = N.triu(self.full_matrix)

        return self.u_triangle

    def getVectorForm(self):
        """Return the upper triangle, diagonal included, as a 1-D array.

        Elements are listed row by row. The result is computed on first
        use and cached.
        """
        # The upper-triangle entries of the dissimilarity matrix are picked
        # by index rather than by masking on their value, so legitimate
        # zeros (e.g. the diagonal) and NaN entries are all kept.
        if self.vector_form is None:
            full = N.asarray(self.getFullMatrix())
            self.vector_form = full[N.triu_indices(full.shape[0])]

        return self.vector_form

    # XXX is there any reason to have these get* methods
    #     instead of plain access to full_matrix and method?
    def getFullMatrix(self):
        """Return the full (square) dissimilarity matrix."""
        return self.full_matrix

    def getMetric(self):
        """Return the name of the metric given at construction."""
        return self.metric
183
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0beta1 on Sun Sep 6 14:24:14 2009 | http://epydoc.sourceforge.net |