Mercurial > hg > nsaunier > traffic-intelligence
comparison trafficintelligence/ml.py @ 1028:cc5cb04b04b0
major update using the trafficintelligence package name and install through pip
| author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
|---|---|
| date | Fri, 15 Jun 2018 11:19:10 -0400 |
| parents | python/ml.py@6ba30b259525 |
| children | 8ffb3ae9f3d2 |
comparison
equal
deleted
inserted
replaced
| 1027:6129296848d3 | 1028:cc5cb04b04b0 |
|---|---|
| 1 #! /usr/bin/env python | |
| 2 '''Libraries for machine learning algorithms''' | |
| 3 | |
| 4 from os import path | |
| 5 from random import shuffle | |
| 6 from copy import copy, deepcopy | |
| 7 | |
| 8 import numpy as np | |
| 9 from matplotlib.pylab import text | |
| 10 import matplotlib as mpl | |
| 11 import matplotlib.pyplot as plt | |
| 12 from scipy.cluster.vq import kmeans, whiten, vq | |
| 13 from sklearn import mixture | |
| 14 try: | |
| 15 import cv2 | |
| 16 opencvAvailable = True | |
| 17 except ImportError: | |
| 18 print('OpenCV library could not be loaded (video replay functions will not be available)') # TODO change to logging module | |
| 19 opencvAvailable = False | |
| 20 | |
| 21 from trafficintelligence import utils | |
| 22 | |
| 23 ##################### | |
| 24 # OpenCV ML models | |
| 25 ##################### | |
| 26 | |
| 27 def computeConfusionMatrix(model, samples, responses): | |
| 28 '''computes the confusion matrix of the classifier (model) | |
| 29 | |
| 30 samples should be n samples by m variables''' | |
| 31 classifications = {} | |
| 32 predictions = model.predict(samples) | |
| 33 for predicted, y in zip(predictions, responses): | |
| 34 classifications[(y, predicted)] = classifications.get((y, predicted), 0)+1 | |
| 35 return classifications | |
| 36 | |
| 37 if opencvAvailable: | |
| 38 class SVM(object): | |
| 39 '''wrapper for OpenCV SimpleVectorMachine algorithm''' | |
| 40 def __init__(self, svmType = cv2.ml.SVM_C_SVC, kernelType = cv2.ml.SVM_RBF, degree = 0, gamma = 1, coef0 = 0, Cvalue = 1, nu = 0, p = 0): | |
| 41 self.model = cv2.ml.SVM_create() | |
| 42 self.model.setType(svmType) | |
| 43 self.model.setKernel(kernelType) | |
| 44 self.model.setDegree(degree) | |
| 45 self.model.setGamma(gamma) | |
| 46 self.model.setCoef0(coef0) | |
| 47 self.model.setC(Cvalue) | |
| 48 self.model.setNu(nu) | |
| 49 self.model.setP(p) | |
| 50 | |
| 51 def save(self, filename): | |
| 52 self.model.save(filename) | |
| 53 | |
| 54 def train(self, samples, layout, responses, computePerformance = False): | |
| 55 self.model.train(samples, layout, responses) | |
| 56 if computePerformance: | |
| 57 return computeConfusionMatrix(self, samples, responses) | |
| 58 | |
| 59 def predict(self, hog): | |
| 60 retval, predictions = self.model.predict(hog) | |
| 61 if hog.shape[0] == 1: | |
| 62 return predictions[0][0] | |
| 63 else: | |
| 64 return np.asarray(predictions, dtype = np.int).ravel().tolist() | |
| 65 | |
| 66 def SVM_load(filename): | |
| 67 if path.exists(filename): | |
| 68 svm = SVM() | |
| 69 svm.model = cv2.ml.SVM_load(filename) | |
| 70 return svm | |
| 71 else: | |
| 72 print('Provided filename {} does not exist: model not loaded!'.format(filename)) | |
| 73 | |
| 74 ##################### | |
| 75 # Clustering | |
| 76 ##################### | |
| 77 | |
| 78 class Centroid(object): | |
| 79 'Wrapper around instances to add a counter' | |
| 80 | |
| 81 def __init__(self, instance, nInstances = 1): | |
| 82 self.instance = instance | |
| 83 self.nInstances = nInstances | |
| 84 | |
| 85 # def similar(instance2): | |
| 86 # return self.instance.similar(instance2) | |
| 87 | |
| 88 def add(self, instance2): | |
| 89 self.instance = self.instance.multiply(self.nInstances)+instance2 | |
| 90 self.nInstances += 1 | |
| 91 self.instance = self.instance.multiply(1/float(self.nInstances)) | |
| 92 | |
| 93 def average(c): | |
| 94 inst = self.instance.multiply(self.nInstances)+c.instance.multiply(instance.nInstances) | |
| 95 inst.multiply(1/(self.nInstances+instance.nInstances)) | |
| 96 return Centroid(inst, self.nInstances+instance.nInstances) | |
| 97 | |
| 98 def plot(self, options = ''): | |
| 99 self.instance.plot(options) | |
| 100 text(self.instance.position.x+1, self.instance.position.y+1, str(self.nInstances)) | |
| 101 | |
| 102 def kMedoids(similarityMatrix, initialCentroids = None, k = None): | |
| 103 '''Algorithm that clusters any dataset based on a similarity matrix | |
| 104 Either the initialCentroids or k are passed''' | |
| 105 pass | |
| 106 | |
| 107 def assignCluster(data, similarFunc, initialCentroids = None, shuffleData = True): | |
| 108 '''k-means algorithm with similarity function | |
| 109 Two instances should be in the same cluster if the sameCluster function returns true for two instances. It is supposed that the average centroid of a set of instances can be computed, using the function. | |
| 110 The number of clusters will be determined accordingly | |
| 111 | |
| 112 data: list of instances | |
| 113 averageCentroid: ''' | |
| 114 localdata = copy(data) # shallow copy to avoid modifying data | |
| 115 if shuffleData: | |
| 116 shuffle(localdata) | |
| 117 if initialCentroids is None: | |
| 118 centroids = [Centroid(localdata[0])] | |
| 119 else: | |
| 120 centroids = deepcopy(initialCentroids) | |
| 121 for instance in localdata[1:]: | |
| 122 i = 0 | |
| 123 while i<len(centroids) and not similarFunc(centroids[i].instance, instance): | |
| 124 i += 1 | |
| 125 if i == len(centroids): | |
| 126 centroids.append(Centroid(instance)) | |
| 127 else: | |
| 128 centroids[i].add(instance) | |
| 129 | |
| 130 return centroids | |
| 131 | |
| 132 # TODO recompute centroids for each cluster: instance that minimizes some measure to all other elements | |
| 133 | |
| 134 def spectralClustering(similarityMatrix, k, iter=20): | |
| 135 '''Spectral Clustering algorithm''' | |
| 136 n = len(similarityMatrix) | |
| 137 # create Laplacian matrix | |
| 138 rowsum = np.sum(similarityMatrix,axis=0) | |
| 139 D = np.diag(1 / np.sqrt(rowsum)) | |
| 140 I = np.identity(n) | |
| 141 L = I - np.dot(D,np.dot(similarityMatrix,D)) | |
| 142 # compute eigenvectors of L | |
| 143 U,sigma,V = np.linalg.svd(L) | |
| 144 # create feature vector from k first eigenvectors | |
| 145 # by stacking eigenvectors as columns | |
| 146 features = np.array(V[:k]).T | |
| 147 # k-means | |
| 148 features = whiten(features) | |
| 149 centroids,distortion = kmeans(features,k, iter) | |
| 150 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster) | |
| 151 return code,sigma | |
| 152 | |
| 153 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0): | |
| 154 '''Assigns instances to prototypes | |
| 155 if minClusterSize is not 0, the clusters will be refined by removing iteratively the smallest clusters | |
| 156 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize | |
| 157 | |
| 158 labels are indices in the prototypeIndices''' | |
| 159 if similarityFunc is None: | |
| 160 print('similarityFunc is None') | |
| 161 return None | |
| 162 | |
| 163 indices = [i for i in range(len(instances)) if i not in prototypeIndices] | |
| 164 labels = [-1]*len(instances) | |
| 165 assign = True | |
| 166 while assign: | |
| 167 for i in prototypeIndices: | |
| 168 labels[i] = i | |
| 169 for i in indices: | |
| 170 for j in prototypeIndices: | |
| 171 if similarities[i][j] < 0: | |
| 172 similarities[i][j] = similarityFunc(instances[i], instances[j]) | |
| 173 similarities[j][i] = similarities[i][j] | |
| 174 label = similarities[i][prototypeIndices].argmax() | |
| 175 if similarities[i][prototypeIndices[label]] >= minSimilarity: | |
| 176 labels[i] = prototypeIndices[label] | |
| 177 else: | |
| 178 labels[i] = -1 # outlier | |
| 179 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} | |
| 180 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get) | |
| 181 assign = (clusterSizes[smallestClusterIndex] < minClusterSize) | |
| 182 if assign: | |
| 183 prototypeIndices.remove(smallestClusterIndex) | |
| 184 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex] | |
| 185 return prototypeIndices, labels | |
| 186 | |
| 187 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, optimizeCentroid = False, randomInitialization = False, initialPrototypeIndices = None): | |
| 188 '''Finds exemplar (prototype) instance that represent each cluster | |
| 189 Returns the prototype indices (in the instances list) | |
| 190 | |
| 191 the elements in the instances list must have a length (method __len__), or one can use the optimizeCentroid | |
| 192 the positions in the instances list corresponds to the similarities | |
| 193 if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed) | |
| 194 similarities must still be allocated with the right size | |
| 195 | |
| 196 if an instance is different enough (<minSimilarity), | |
| 197 it will become a new prototype. | |
| 198 Non-prototype instances will be assigned to an existing prototype | |
| 199 | |
| 200 if optimizeCentroid is True, each time an element is added, we recompute the centroid trajectory as the most similar to all in the cluster | |
| 201 | |
| 202 initialPrototypeIndices are indices in instances | |
| 203 | |
| 204 TODO: check how similarity evolves in clusters''' | |
| 205 if len(instances) == 0: | |
| 206 print('no instances to cluster (empty list)') | |
| 207 return None | |
| 208 if similarityFunc is None: | |
| 209 print('similarityFunc is None') | |
| 210 return None | |
| 211 | |
| 212 # sort instances based on length | |
| 213 indices = range(len(instances)) | |
| 214 if randomInitialization or optimizeCentroid: | |
| 215 indices = np.random.permutation(indices).tolist() | |
| 216 else: | |
| 217 def compare(i, j): | |
| 218 if len(instances[i]) > len(instances[j]): | |
| 219 return -1 | |
| 220 elif len(instances[i]) == len(instances[j]): | |
| 221 return 0 | |
| 222 else: | |
| 223 return 1 | |
| 224 indices.sort(compare) | |
| 225 # initialize clusters | |
| 226 clusters = [] | |
| 227 if initialPrototypeIndices is None: | |
| 228 prototypeIndices = [indices[0]] | |
| 229 else: | |
| 230 prototypeIndices = initialPrototypeIndices # think of the format: if indices, have to be in instances | |
| 231 for i in prototypeIndices: | |
| 232 clusters.append([i]) | |
| 233 indices.remove(i) | |
| 234 # go through all instances | |
| 235 for i in indices: | |
| 236 for j in prototypeIndices: | |
| 237 if similarities[i][j] < 0: | |
| 238 similarities[i][j] = similarityFunc(instances[i], instances[j]) | |
| 239 similarities[j][i] = similarities[i][j] | |
| 240 label = similarities[i][prototypeIndices].argmax() # index in prototypeIndices | |
| 241 if similarities[i][prototypeIndices[label]] < minSimilarity: | |
| 242 prototypeIndices.append(i) | |
| 243 clusters.append([]) | |
| 244 else: | |
| 245 clusters[label].append(i) | |
| 246 if optimizeCentroid: | |
| 247 if len(clusters[label]) >= 2: # no point if only one element in cluster | |
| 248 for j in clusters[label][:-1]: | |
| 249 if similarities[i][j] < 0: | |
| 250 similarities[i][j] = similarityFunc(instances[i], instances[j]) | |
| 251 similarities[j][i] = similarities[i][j] | |
| 252 clusterIndices = clusters[label] | |
| 253 clusterSimilarities = similarities[clusterIndices][:,clusterIndices] | |
| 254 newCentroidIdx = clusterIndices[clusterSimilarities.sum(0).argmax()] | |
| 255 if prototypeIndices[label] != newCentroidIdx: | |
| 256 prototypeIndices[label] = newCentroidIdx | |
| 257 elif len(instances[prototypeIndices[label]]) < len(instances[i]): # replace prototype by current instance i if longer # otherwise, possible to test if randomInitialization or initialPrototypes is not None | |
| 258 prototypeIndices[label] = i | |
| 259 return prototypeIndices | |
| 260 | |
| 261 def computeClusterSizes(labels, prototypeIndices, outlierIndex = -1): | |
| 262 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} | |
| 263 clusterSizes['outlier'] = sum(np.array(labels) == outlierIndex) | |
| 264 return clusterSizes | |
| 265 | |
| 266 def computeClusterStatistics(labels, prototypeIndices, instances, similarities, similarityFunc, clusters = None, outlierIndex = -1): | |
| 267 if clusters is None: | |
| 268 clusters = {protoId:[] for protoId in prototypeIndices+[-1]} | |
| 269 for i,l in enumerate(labels): | |
| 270 clusters[l].append(i) | |
| 271 clusters = [clusters[protoId] for protoId in prototypeIndices] | |
| 272 for i, cluster in enumerate(clusters): | |
| 273 n = len(cluster) | |
| 274 print('cluster {}: {} elements'.format(prototypeIndices[i], n)) | |
| 275 if n >=2: | |
| 276 for j,k in enumerate(cluster): | |
| 277 for l in cluster[:j]: | |
| 278 if similarities[k][l] < 0: | |
| 279 similarities[k][l] = similarityFunc(instances[k], instances[l]) | |
| 280 similarities[l][k] = similarities[k][l] | |
| 281 print('Mean similarity to prototype: {}'.format((similarities[prototypeIndices[i]][cluster].sum()+1)/(n-1))) | |
| 282 print('Mean overall similarity: {}'.format((similarities[cluster][:,cluster].sum()+n)/(n*(n-1)))) | |
| 283 | |
| 284 # Gaussian Mixture Models | |
| 285 def plotGMM(mean, covariance, gmmId, fig, color, alpha = 0.3): | |
| 286 v, w = np.linalg.eigh(covariance) | |
| 287 angle = 180*np.arctan2(w[0][1], w[0][0])/np.pi | |
| 288 v *= 4 | |
| 289 ell = mpl.patches.Ellipse(mean, v[0], v[1], 180+angle, color=color) | |
| 290 ell.set_clip_box(fig.bbox) | |
| 291 ell.set_alpha(alpha) | |
| 292 fig.axes[0].add_artist(ell) | |
| 293 plt.plot([mean[0]], [mean[1]], 'x'+color) | |
| 294 plt.annotate(str(gmmId), xy=(mean[0]+1, mean[1]+1)) | |
| 295 | |
| 296 def plotGMMClusters(model, labels = None, dataset = None, fig = None, colors = utils.colors, nUnitsPerPixel = 1., alpha = 0.3): | |
| 297 '''plot the ellipse corresponding to the Gaussians | |
| 298 and the predicted classes of the instances in the dataset''' | |
| 299 if fig is None: | |
| 300 fig = plt.figure() | |
| 301 if len(fig.get_axes()) == 0: | |
| 302 fig.add_subplot(111) | |
| 303 for i in range(model.n_components): | |
| 304 mean = model.means_[i]/nUnitsPerPixel | |
| 305 covariance = model.covariances_[i]/nUnitsPerPixel | |
| 306 # plot points | |
| 307 if dataset is not None: | |
| 308 tmpDataset = dataset/nUnitsPerPixel | |
| 309 plt.scatter(tmpDataset[labels == i, 0], tmpDataset[labels == i, 1], .8, color=colors[i]) | |
| 310 # plot an ellipse to show the Gaussian component | |
| 311 plotGMM(mean, covariance, i, fig, colors[i], alpha) | |
| 312 if dataset is None: # to address issues without points, the axes limits are not redrawn | |
| 313 minima = model.means_.min(0) | |
| 314 maxima = model.means_.max(0) | |
| 315 xwidth = 0.5*(maxima[0]-minima[0]) | |
| 316 ywidth = 0.5*(maxima[1]-minima[1]) | |
| 317 plt.xlim(minima[0]-xwidth,maxima[0]+xwidth) | |
| 318 plt.ylim(minima[1]-ywidth,maxima[1]+ywidth) | |
| 319 | |
| 320 if __name__ == "__main__": | |
| 321 import doctest | |
| 322 import unittest | |
| 323 suite = doctest.DocFileSuite('tests/ml.txt') | |
| 324 unittest.TextTestRunner().run(suite) | |
| 325 # #doctest.testmod() | |
| 326 # #doctest.testfile("example.txt") |
