1
2
3
4
5
6
7
8
9 """k-Nearest-Neighbour classifier."""
10
11 __docformat__ = 'restructuredtext'
12
13
14 import numpy as N
15
16 from mvpa.base import warning
17 from mvpa.misc.support import indentDoc
18 from mvpa.clfs.base import Classifier
19 from mvpa.base.dochelpers import enhancedDocString
20 from mvpa.clfs.distance import squared_euclidean_distance
21
22 if __debug__:
23 from mvpa.base import debug
24
25
26 -class kNN(Classifier):
27 """k-Nearest-Neighbour classifier.
28
29 This is a simple classifier that bases its decision on the distances between
30 the training dataset samples and the test sample(s). Distances are computed
31 using a customizable distance function. A certain number (`k`)of nearest
32 neighbors is selected based on the smallest distances and the labels of this
33 neighboring samples are fed into a voting function to determine the labels
34 of the test sample.
35
36 Training a kNN classifier is extremely quick, as no actuall training is
37 performed as the training dataset is simply stored in the classifier. All
38 computations are done during classifier prediction.
39
40 .. note::
41 If enabled, kNN stores the votes per class in the 'values' state after
42 calling predict().
43 """
44
45 _clf_internals = ['knn', 'non-linear', 'binary', 'multiclass',
46 'notrain2predict' ]
47
50 """
51 :Parameters:
52 k: unsigned integer
53 Number of nearest neighbours to be used for voting.
54 dfx: functor
55 Function to compute the distances between training and test samples.
56 Default: squared euclidean distance
57 voting: str
58 Voting method used to derive predictions from the nearest neighbors.
59 Possible values are 'majority' (simple majority of classes
60 determines vote) and 'weighted' (votes are weighted according to the
61 relative frequencies of each class in the training data).
62 **kwargs:
63 Additonal arguments are passed to the base class.
64 """
65
66
67 Classifier.__init__(self, **kwargs)
68
69 self.__k = k
70 self.__dfx = dfx
71 self.__voting = voting
72 self.__data = None
73
74
76 """Representation of the object
77 """
78 return "kNN(k=%d, dfx=%s enable_states=%s)" % \
79 (self.__k, self.__dfx, str(self.states.enabled))
80
81
85
86
88 """Train the classifier.
89
90 For kNN it is degenerate -- just stores the data.
91 """
92 self.__data = data
93 if __debug__:
94 if str(data.samples.dtype).startswith('uint') \
95 or str(data.samples.dtype).startswith('int'):
96 warning("kNN: input data is in integers. " + \
97 "Overflow on arithmetic operations might result in"+\
98 " errors. Please convert dataset's samples into" +\
99 " floating datatype if any error is reported.")
100 self.__weights = None
101
102
103 uniquelabels = data.uniquelabels
104 self.__votes_init = dict(zip(uniquelabels,
105 [0] * len(uniquelabels)))
106
107
109 """Predict the class labels for the provided data.
110
111 Returns a list of class labels (one for each data sample).
112 """
113
114 data = N.asarray(data)
115
116
117 if __debug__:
118 if not data.ndim == 2:
119 raise ValueError, "Data array must be two-dimensional."
120
121 if not data.shape[1] == self.__data.nfeatures:
122 raise ValueError, "Length of data samples (features) does " \
123 "not match the classifier."
124
125
126
127
128 dists = self.__dfx(self.__data.samples, data).T
129
130
131 knns = dists.argsort(axis=1)[:, :self.__k]
132
133
134 predicted = []
135 votes = []
136
137 if self.__voting == 'majority':
138 vfx = self.getMajorityVote
139 elif self.__voting == 'weighted':
140 vfx = self.getWeightedVote
141 else:
142 raise ValueError, "kNN told to perform unknown voting '%s'." % self.__voting
143
144
145 results = [vfx(knn) for knn in knns]
146
147
148 predicted = [r[0] for r in results]
149
150
151
152 self.predictions = predicted
153 self.values = [r[1] for r in results]
154
155 return predicted
156
157
159 """Simple voting by choosing the majority of class neighbours.
160 """
161
162 uniquelabels = self.__data.uniquelabels
163
164
165 knn_labels = N.array([ self.__data.labels[nn] for nn in knn_ids ])
166
167
168 votes = self.__votes_init.copy()
169 for nn in knn_ids:
170 votes[self.__labels[nn]] += 1
171
172
173
174 return uniquelabels[N.asarray(votes).argmax()], \
175 votes
176
177
179 """Vote with classes weighted by the number of samples per class.
180 """
181 uniquelabels = self.__data.uniquelabels
182
183
184 if self.__weights is None:
185
186
187
188
189 self.__labels = self.__data.labels
190 Nlabels = len(self.__labels)
191 Nuniquelabels = len(uniquelabels)
192
193
194
195
196
197
198
199 self.__weights = \
200 [ 1.0 - ((self.__labels == label).sum() / Nlabels) \
201 for label in uniquelabels ]
202 self.__weights = dict(zip(uniquelabels, self.__weights))
203
204
205
206 votes = self.__votes_init.copy()
207 for nn in knn_ids:
208 votes[self.__labels[nn]] += 1
209
210
211 votes = [ self.__weights[ul] * votes[ul] for ul in uniquelabels]
212
213
214
215 return uniquelabels[N.asarray(votes).argmax()], \
216 votes
217
218
220 """Reset trained state"""
221 self.__data = None
222 super(kNN, self).untrain()
223