Mercurial > hg > nsaunier > traffic-intelligence
comparison python/ml.py @ 980:23f98ebb113f
first tests for clustering algo
| author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
|---|---|
| date | Mon, 19 Feb 2018 16:32:59 -0500 |
| parents | 184f1dd307f9 |
| children | e8eabef7857c |
comparison
equal
deleted
inserted
replaced
| 979:cc89267b5ff9 | 980:23f98ebb113f |
|---|---|
| 154 self.prototypeId = prototypeId | 154 self.prototypeId = prototypeId |
| 155 self.memberIndices = memberIndices | 155 self.memberIndices = memberIndices |
| 156 | 156 |
| 157 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0): | 157 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0): |
| 158 '''Assigns instances to prototypes | 158 '''Assigns instances to prototypes |
| 159 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters | 159 if minClusterSize is not 0, the clusters will be refined by removing iteratively the smallest clusters |
| 160 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' | 160 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize |
| 161 | |
| 162 labels are indices in the prototypeIndices''' | |
| 163 if similarityFunc is None: | |
| 164 print('similarityFunc is None') | |
| 165 return None | |
| 166 | |
| 161 indices = [i for i in range(len(instances)) if i not in prototypeIndices] | 167 indices = [i for i in range(len(instances)) if i not in prototypeIndices] |
| 162 labels = [-1]*len(instances) | 168 labels = [-1]*len(instances) |
| 163 assign = True | 169 assign = True |
| 164 while assign: | 170 while assign: |
| 165 for i in prototypeIndices: | 171 for i in prototypeIndices: |
| 166 labels[i] = i | 172 labels[i] = i |
| 167 for i in indices: | 173 for i in indices: |
| 168 if similarityFunc is not None: | 174 for j in prototypeIndices: |
| 169 for j in prototypeIndices: | 175 if similarities[i][j] < 0: |
| 170 if similarities[i][j] < 0: | 176 similarities[i][j] = similarityFunc(instances[i], instances[j]) |
| 171 similarities[i][j] = similarityFunc(instances[i], instances[j]) | 177 similarities[j][i] = similarities[i][j] |
| 172 similarities[j][i] = similarities[i][j] | 178 label = similarities[i][prototypeIndices].argmax() |
| 173 prototypeIdx = similarities[i][prototypeIndices].argmax() | 179 if similarities[i][prototypeIndices[label]] >= minSimilarity: |
| 174 if similarities[i][prototypeIndices[prototypeIdx]] >= minSimilarity: | 180 labels[i] = prototypeIndices[label] |
| 175 labels[i] = prototypeIndices[prototypeIdx] | |
| 176 else: | 181 else: |
| 177 labels[i] = -1 # outlier | 182 labels[i] = -1 # outlier |
| 178 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} | 183 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} |
| 179 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get) | 184 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get) |
| 180 assign = (clusterSizes[smallestClusterIndex] < minClusterSize) | 185 assign = (clusterSizes[smallestClusterIndex] < minClusterSize) |
| 181 if assign: | 186 if assign: |
| 182 prototypeIndices.remove(smallestClusterIndex) | 187 prototypeIndices.remove(smallestClusterIndex) |
| 183 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex] | 188 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex] |
| 184 return prototypeIndices, labels | 189 return prototypeIndices, labels |
| 185 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, optimizeCentroid = True, randomInitialization = False, initialPrototypeIndices = None): | 190 |
| 191 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, optimizeCentroid = False, randomInitialization = False, initialPrototypeIndices = None): | |
| 186 '''Finds exemplar (prototype) instance that represent each cluster | 192 '''Finds exemplar (prototype) instance that represent each cluster |
| 187 Returns the prototype indices (in the instances list) | 193 Returns the prototype indices (in the instances list) |
| 188 | 194 |
| 189 the elements in the instances list must have a length (method __len__), or one can use the random initialization | 195 the elements in the instances list must have a length (method __len__), or one can use the optimizeCentroid |
| 190 the positions in the instances list corresponds to the similarities | 196 the positions in the instances list corresponds to the similarities |
| 191 if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed) | 197 if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed) |
| 192 similarities must still be allocated with the right size | 198 similarities must still be allocated with the right size |
| 193 | 199 |
| 194 if an instance is different enough (<minSimilarity), | 200 if an instance is different enough (<minSimilarity), |
| 208 return None | 214 return None |
| 209 | 215 |
| 210 # sort instances based on length | 216 # sort instances based on length |
| 211 indices = range(len(instances)) | 217 indices = range(len(instances)) |
| 212 if randomInitialization or optimizeCentroid: | 218 if randomInitialization or optimizeCentroid: |
| 213 indices = np.random.permutation(indices) | 219 indices = np.random.permutation(indices).tolist() |
| 214 else: | 220 else: |
| 215 def compare(i, j): | 221 def compare(i, j): |
| 216 if len(instances[i]) > len(instances[j]): | 222 if len(instances[i]) > len(instances[j]): |
| 217 return -1 | 223 return -1 |
| 218 elif len(instances[i]) == len(instances[j]): | 224 elif len(instances[i]) == len(instances[j]): |
| 219 return 0 | 225 return 0 |
| 220 else: | 226 else: |
| 221 return 1 | 227 return 1 |
| 222 indices.sort(compare) | 228 indices.sort(compare) |
| 223 # go through all instances | 229 # initialize clusters |
| 224 clusters = [] | 230 clusters = [] |
| 225 if initialPrototypeIndices is None: | 231 if initialPrototypeIndices is None: |
| 226 prototypeIndices = [indices[0]] | 232 prototypeIndices = [indices[0]] |
| 227 else: | 233 else: |
| 228 prototypeIndices = initialPrototypeIndices # think of the format: if indices, have to be in instances | 234 prototypeIndices = initialPrototypeIndices # think of the format: if indices, have to be in instances |
| 229 for i in prototypeIndices: | 235 for i in prototypeIndices: |
| 230 clusters.append([i]) | 236 clusters.append([i]) |
| 231 indices.remove(i) | 237 indices.remove(i) |
| 238 # go through all instances | |
| 232 for i in indices: | 239 for i in indices: |
| 233 for j in prototypeIndices: | 240 for j in prototypeIndices: |
| 234 if similarities[i][j] < 0: | 241 if similarities[i][j] < 0: |
| 235 similarities[i][j] = similarityFunc(instances[i], instances[j]) | 242 similarities[i][j] = similarityFunc(instances[i], instances[j]) |
| 236 similarities[j][i] = similarities[i][j] | 243 similarities[j][i] = similarities[i][j] |
| 237 label = similarities[i][prototypeIndices].argmax() | 244 label = similarities[i][prototypeIndices].argmax() # index in prototypeIndices |
| 238 if similarities[i][prototypeIndices[label]] < minSimilarity: | 245 if similarities[i][prototypeIndices[label]] < minSimilarity: |
| 239 prototypeIndices.append(i) | 246 prototypeIndices.append(i) |
| 240 clusters.append([]) | 247 clusters.append([]) |
| 241 else: | 248 else: |
| 242 clusters[label].append(i) | 249 clusters[label].append(i) |
| 311 maxima = model.means_.max(0) | 318 maxima = model.means_.max(0) |
| 312 xwidth = 0.5*(maxima[0]-minima[0]) | 319 xwidth = 0.5*(maxima[0]-minima[0]) |
| 313 ywidth = 0.5*(maxima[1]-minima[1]) | 320 ywidth = 0.5*(maxima[1]-minima[1]) |
| 314 plt.xlim(minima[0]-xwidth,maxima[0]+xwidth) | 321 plt.xlim(minima[0]-xwidth,maxima[0]+xwidth) |
| 315 plt.ylim(minima[1]-ywidth,maxima[1]+ywidth) | 322 plt.ylim(minima[1]-ywidth,maxima[1]+ywidth) |
| 323 | |
| 324 if __name__ == "__main__": | |
| 325 import doctest | |
| 326 import unittest | |
| 327 suite = doctest.DocFileSuite('tests/ml.txt') | |
| 328 unittest.TextTestRunner().run(suite) | |
| 329 # #doctest.testmod() | |
| 330 # #doctest.testfile("example.txt") |
