Package mvpa :: Package clfs :: Module knn
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.clfs.knn

  1  #emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  #ex: set sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """k-Nearest-Neighbour classifier.""" 
 10   
 11  __docformat__ = 'restructuredtext' 
 12   
 13   
 14  import numpy as N 
 15   
 16  from mvpa.base import warning 
 17  from mvpa.misc.support import indentDoc 
 18  from mvpa.clfs.base import Classifier 
 19  from mvpa.base.dochelpers import enhancedDocString 
 20  from mvpa.clfs.distance import squared_euclidean_distance 
 21   
 22  if __debug__: 
 23      from mvpa.base import debug 
 24   
 25   
class kNN(Classifier):
    """k-Nearest-Neighbour classifier.

    This is a simple classifier that bases its decision on the distances
    between the training dataset samples and the test sample(s). Distances
    are computed using a customizable distance function. A certain number
    (`k`) of nearest neighbors is selected based on the smallest distances
    and the labels of these neighboring samples are fed into a voting
    function to determine the labels of the test sample.

    Training a kNN classifier is extremely quick, as no actual training is
    performed: the training dataset is simply stored in the classifier. All
    computations are done during classifier prediction.

    .. note::
      If enabled, kNN stores the votes per class in the 'values' state after
      calling predict().
    """

    _clf_internals = ['knn', 'non-linear', 'binary', 'multiclass',
                      'notrain2predict']

    def __init__(self, k=2, dfx=squared_euclidean_distance,
                 voting='weighted', **kwargs):
        """
        :Parameters:
          k: unsigned integer
            Number of nearest neighbours to be used for voting.
          dfx: functor
            Function to compute the distances between training and test
            samples. Default: squared euclidean distance
          voting: str
            Voting method used to derive predictions from the nearest
            neighbors. Possible values are 'majority' (simple majority of
            classes determines vote) and 'weighted' (votes are weighted
            according to the relative frequencies of each class in the
            training data).
          **kwargs:
            Additional arguments are passed to the base class.
        """
        # init base class first
        Classifier.__init__(self, **kwargs)

        self.__k = k
        self.__dfx = dfx
        self.__voting = voting
        self.__data = None


    def __repr__(self):
        """Representation of the object
        """
        # NOTE: comma between dfx and enable_states was missing previously,
        # producing a malformed repr
        return "kNN(k=%d, dfx=%s, enable_states=%s)" % \
               (self.__k, self.__dfx, str(self.states.enabled))


    def __str__(self):
        return "%s\n data: %s" % \
               (Classifier.__str__(self), indentDoc(self.__data))


    def _train(self, data):
        """Train the classifier.

        For kNN it is degenerate -- just stores the data.
        """
        self.__data = data
        if __debug__:
            if str(data.samples.dtype).startswith('uint') \
                or str(data.samples.dtype).startswith('int'):
                warning("kNN: input data is in integers. " + \
                        "Overflow on arithmetic operations might result in"+\
                        " errors. Please convert dataset's samples into" +\
                        " floating datatype if any error is reported.")
        # class weights are computed lazily on first weighted vote
        self.__weights = None

        # create dictionary with an item for each condition
        uniquelabels = data.uniquelabels
        self.__votes_init = dict(zip(uniquelabels,
                                     [0] * len(uniquelabels)))


    def _predict(self, data):
        """Predict the class labels for the provided data.

        Returns a list of class labels (one for each data sample).
        """
        # make sure we're talking about arrays
        data = N.asarray(data)

        # checks only in debug mode
        if __debug__:
            if not data.ndim == 2:
                raise ValueError("Data array must be two-dimensional.")

            if not data.shape[1] == self.__data.nfeatures:
                raise ValueError("Length of data samples (features) does "
                                 "not match the classifier.")

        # compute the distance matrix between training and test data with
        # distances stored row-wise, ie. distances between test sample [0]
        # and all training samples will end up in row 0
        dists = self.__dfx(self.__data.samples, data).T

        # determine the k nearest neighbors per test sample
        knns = dists.argsort(axis=1)[:, :self.__k]

        if self.__voting == 'majority':
            vfx = self.getMajorityVote
        elif self.__voting == 'weighted':
            vfx = self.getWeightedVote
        else:
            raise ValueError("kNN told to perform unknown voting '%s'."
                             % self.__voting)

        # perform voting
        results = [vfx(knn) for knn in knns]

        # extract predictions
        predicted = [r[0] for r in results]

        # store the predictions in the state. Relies on State._setitem to do
        # nothing if the relevant state member is not enabled
        self.predictions = predicted
        self.values = [r[1] for r in results]

        return predicted


    def getMajorityVote(self, knn_ids):
        """Simple voting by choosing the majority of class neighbours.

        :Parameters:
          knn_ids: sequence
            Indices (into the training dataset) of the k nearest neighbors
            of one test sample.

        :Returns:
          Tuple of (winning label, per-class vote counts ordered like
          `uniquelabels`).
        """
        uniquelabels = self.__data.uniquelabels
        # BUGFIX: read labels from the training data directly --
        # `self.__labels` is only assigned lazily inside getWeightedVote(),
        # so relying on it here raised AttributeError for voting='majority'
        labels = self.__data.labels

        # number of occurrences for each unique class in kNNs
        votes = self.__votes_init.copy()
        for nn in knn_ids:
            votes[labels[nn]] += 1

        # BUGFIX: order vote counts to match uniquelabels before argmax --
        # previously the dict itself was passed to N.asarray(), which does
        # not yield a usable numeric array
        votes = [votes[ul] for ul in uniquelabels]

        # find the class with most votes
        # return votes as well to store them in the state
        return uniquelabels[N.asarray(votes).argmax()], votes


    def getWeightedVote(self, knn_ids):
        """Vote with classes weighted by the number of samples per class.

        :Parameters:
          knn_ids: sequence
            Indices (into the training dataset) of the k nearest neighbors
            of one test sample.

        :Returns:
          Tuple of (winning label, per-class weighted votes ordered like
          `uniquelabels`).
        """
        uniquelabels = self.__data.uniquelabels

        # Lazy evaluation
        if self.__weights is None:
            #
            # It seemed to Yarik that this has to be evaluated just once per
            # training dataset.
            #
            self.__labels = self.__data.labels
            Nlabels = len(self.__labels)
            Nuniquelabels = len(uniquelabels)

            # TODO: To get proper speed up for the next line only,
            #       histogram should be computed via sorting + counting
            #       "same" elements while reducing. Guaranteed complexity is
            #       NlogN whenever now it is N^2
            # compute the relative proportion of samples belonging to each
            # class (do it in one loop to improve speed and reduce
            # readability)
            # BUGFIX: force float division -- under Python 2 the integer
            # division truncated to 0 and every weight collapsed to 1.0
            self.__weights = \
                [1.0 - ((self.__labels == label).sum() / float(Nlabels))
                 for label in uniquelabels]
            self.__weights = dict(zip(uniquelabels, self.__weights))

        # number of occurrences for each unique class in kNNs
        votes = self.__votes_init.copy()
        for nn in knn_ids:
            votes[self.__labels[nn]] += 1

        # weight votes
        votes = [self.__weights[ul] * votes[ul] for ul in uniquelabels]

        # find the class with most votes
        # return votes as well to store them in the state
        return uniquelabels[N.asarray(votes).argmax()], votes


    def untrain(self):
        """Reset trained state"""
        self.__data = None
        super(kNN, self).untrain()