Mercurial > hg > nsaunier > traffic-intelligence
comparison python/ml.py @ 908:b297525b2cbf
added options to the prototype cluster algorithm, work in progress
| author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
|---|---|
| date | Mon, 26 Jun 2017 00:10:35 -0400 |
| parents | 9fd7b18f75b4 |
| children | 1cd878812529 |
comparison
equal
deleted
inserted
replaced
| 907:9fd7b18f75b4 | 908:b297525b2cbf |
|---|---|
| 129 # k-means | 129 # k-means |
| 130 features = whiten(features) | 130 features = whiten(features) |
| 131 centroids,distortion = kmeans(features,k, iter) | 131 centroids,distortion = kmeans(features,k, iter) |
| 132 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster) | 132 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster) |
| 133 return code,sigma | 133 return code,sigma |
| 134 | |
| 135 class Cluster: | |
| 136 'Represents a cluster, with a prototype id and the list of instances in cluster' | |
| 137 def __init__(prototypeId, memberIndices = []): | |
| 138 self.prototypeId = prototypeId | |
| 139 self.memberIndices = memberIndices | |
| 134 | 140 |
| 135 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = None): | 141 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = None): |
| 136 '''Assigns instances to prototypes | 142 '''Assigns instances to prototypes |
| 137 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters | 143 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters |
| 138 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' | 144 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' |
| 159 if assign: | 165 if assign: |
| 160 prototypeIndices.remove(smallestClusterIndex) | 166 prototypeIndices.remove(smallestClusterIndex) |
| 161 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex] | 167 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex] |
| 162 return prototypeIndices, labels | 168 return prototypeIndices, labels |
| 163 | 169 |
| 164 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0, randomInitialization = False, assign = True, initialPrototypeIndices = None): | 170 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0, optimizeCentroid = True, randomInitialization = False, assign = True, initialPrototypeIndices = None): |
| 165 '''Finds exemplar (prototype) instance that represent each cluster | 171 '''Finds exemplar (prototype) instance that represent each cluster |
| 166 Returns the prototype indices (in the instances list) and the cluster label of each instance | 172 Returns the prototype indices (in the instances list) and the cluster label of each instance |
| 167 | 173 |
| 168 the elements in the instances list must have a length (method __len__), or one can use the random initialization | 174 the elements in the instances list must have a length (method __len__), or one can use the random initialization |
| 169 the positions in the instances list corresponds to the similarities | 175 the positions in the instances list corresponds to the similarities |
| 172 | 178 |
| 173 if an instance is different enough (<minSimilarity), | 179 if an instance is different enough (<minSimilarity), |
| 174 it will become a new prototype. | 180 it will become a new prototype. |
| 175 Non-prototype instances will be assigned to an existing prototype | 181 Non-prototype instances will be assigned to an existing prototype |
| 176 | 182 |
| 177 TODO: at each step, optimize the prototype as the most similar in its current cluster (can be done easily if similarities are already computed)''' | 183 if optimizeCentroid is True, each time an element is added, we recompute the centroid trajectory as the most similar to all in the cluster |
| 178 | 184 |
| 179 # sort instances based on length | 185 TODO: check how similarity evolves in clusters''' |
| 180 if len(instances) == 0: | 186 if len(instances) == 0: |
| 181 print('no instances to cluster (empty list)') | 187 print('no instances to cluster (empty list)') |
| 182 return None | 188 return None |
| 183 | 189 if similarityFunc is None: |
| 190 print('similarityFunc is None') | |
| 191 return None | |
| 192 | |
| 193 # sort instances based on length | |
| 184 indices = range(len(instances)) | 194 indices = range(len(instances)) |
| 185 if randomInitialization: | 195 if randomInitialization or optimizeCentroid: |
| 186 indices = np.random.permutation(indices) | 196 indices = np.random.permutation(indices) |
| 187 else: | 197 else: |
| 188 def compare(i, j): | 198 def compare(i, j): |
| 189 if len(instances[i]) > len(instances[j]): | 199 if len(instances[i]) > len(instances[j]): |
| 190 return -1 | 200 return -1 |
| 192 return 0 | 202 return 0 |
| 193 else: | 203 else: |
| 194 return 1 | 204 return 1 |
| 195 indices.sort(compare) | 205 indices.sort(compare) |
| 196 # go through all instances | 206 # go through all instances |
| 207 clusters = [] | |
| 197 if initialPrototypeIndices is None: | 208 if initialPrototypeIndices is None: |
| 198 prototypeIndices = [indices[0]] | 209 prototypeIndices = [indices[0]] |
| 199 else: | 210 else: |
| 200 prototypeIndices = initialPrototypeIndices | 211 prototypeIndices = initialPrototypeIndices # think of the format: if indices, have to be in instances |
| 212 for i in prototypeIndices: | |
| 213 clusters.append([i]) | |
| 201 for i in indices[1:]: | 214 for i in indices[1:]: |
| 202 if similarityFunc is not None: | 215 for j in prototypeIndices: |
| 203 for j in prototypeIndices: | 216 if similarities[i][j] < 0: |
| 204 if similarities[i][j] < 0: | 217 similarities[i][j] = similarityFunc(instances[i], instances[j]) |
| 205 similarities[i][j] = similarityFunc(instances[i], instances[j]) | 218 similarities[j][i] = similarities[i][j] |
| 206 similarities[j][i] = similarities[i][j] | 219 label = similarities[i][prototypeIndices].argmax() |
| 207 if similarities[i][prototypeIndices].max() < minSimilarity: | 220 if similarities[i][prototypeIndices[label]] < minSimilarity: |
| 208 prototypeIndices.append(i) | 221 prototypeIndices.append(i) |
| 209 elif randomInitialization: # replace prototype by current instance i if longer | 222 clusters.append([]) |
| 210 label = similarities[i][prototypeIndices].argmax() | 223 else: |
| 211 if len(instances[prototypeIndices[label]]) < len(instances[i]): | 224 clusters[label].append(i) |
| 212 prototypeIndices[label] = i | 225 if optimizeCentroid: |
| 226 if len(clusters[label]) >= 2: # no point if only one element in cluster | |
| 227 for j in clusters[label][:-1]: | |
| 228 if similarities[i][j] < 0: | |
| 229 similarities[i][j] = similarityFunc(instances[i], instances[j]) | |
| 230 similarities[j][i] = similarities[i][j] | |
| 231 clusterIndices = clusters[label] | |
| 232 clusterSimilarities = similarities[clusterIndices][:,clusterIndices] | |
| 233 newCentroidIdx = clusterIndices[clusterSimilarities.sum(0).argmax()] | |
| 234 if prototypeIndices[label] != newCentroidIdx: | |
| 235 prototypeIndices[label] = newCentroidIdx | |
| 236 elif randomInitialization: # replace prototype by current instance i if longer | |
| 237 if len(instances[prototypeIndices[label]]) < len(instances[i]): | |
| 238 prototypeIndices[label] = i | |
| 239 | |
| 213 if assign: | 240 if assign: |
| 214 return assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc, minClusterSize) | 241 return assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc, minClusterSize) |
| 215 else: | 242 else: |
| 216 return prototypeIndices, None | 243 return prototypeIndices, None |
| 217 | 244 |
| 218 def computeClusterSizes(labels, prototypeIndices, outlierIndex = -1): | 245 def computeClusterSizes(labels, prototypeIndices, outlierIndex = -1): |
| 219 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} | 246 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} |
| 220 clusterSizes['outlier'] = sum(np.array(labels) == outlierIndex) | 247 clusterSizes['outlier'] = sum(np.array(labels) == outlierIndex) |
| 221 return clusterSizes | 248 return clusterSizes |
| 222 | 249 |
| 250 def computeClusterStatistics(labels, prototypeIndices, instances, similarities, similarityFunc, clusters = None, outlierIndex = -1): | |
| 251 if clusters is None: | |
| 252 clusters = {protoId:[] for protoId in prototypeIndices+[-1]} | |
| 253 for i,l in enumerate(labels): | |
| 254 clusters[l].append(i) | |
| 255 clusters = [clusters[protoId] for protoId in prototypeIndices] | |
| 256 for i, cluster in enumerate(clusters): | |
| 257 n = len(cluster) | |
| 258 print('cluster {}: {} elements'.format(prototypeIndices[i], n)) | |
| 259 if n >=2: | |
| 260 for j,k in enumerate(cluster): | |
| 261 for l in cluster[:j]: | |
| 262 if similarities[k][l] < 0: | |
| 263 similarities[k][l] = similarityFunc(instances[k], instances[l]) | |
| 264 similarities[l][k] = similarities[k][l] | |
| 265 print('Mean similarity to prototype: {}'.format((similarities[prototypeIndices[i]][cluster].sum()+1)/(n-1))) | |
| 266 print('Mean overall similarity: {}'.format((similarities[cluster][:,cluster].sum()+n)/(n*(n-1)))) | |
| 267 | |
| 223 # Gaussian Mixture Models | 268 # Gaussian Mixture Models |
| 224 def plotGMMClusters(model, dataset = None, fig = None, colors = utils.colors, nUnitsPerPixel = 1., alpha = 0.3): | 269 def plotGMMClusters(model, dataset = None, fig = None, colors = utils.colors, nUnitsPerPixel = 1., alpha = 0.3): |
| 225 '''plot the ellipse corresponding to the Gaussians | 270 '''plot the ellipse corresponding to the Gaussians |
| 226 and the predicted classes of the instances in the dataset''' | 271 and the predicted classes of the instances in the dataset''' |
| 227 if fig is None: | 272 if fig is None: |
