Mercurial > hg > nsaunier > traffic-intelligence
comparison python/ml.py @ 735:0e875a7f5759 dev
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
| author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
|---|---|
| date | Wed, 12 Aug 2015 00:24:06 -0400 |
| parents | 1d4dcb5c8708 |
| children | 2472b4d59aea |
comparison
equal
deleted
inserted
replaced
| 734:1d4dcb5c8708 | 735:0e875a7f5759 |
|---|---|
| 109 features = whiten(features) | 109 features = whiten(features) |
| 110 centroids,distortion = kmeans(features,k, iter) | 110 centroids,distortion = kmeans(features,k, iter) |
| 111 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster) | 111 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster) |
| 112 return code,sigma | 112 return code,sigma |
| 113 | 113 |
| 114 def prototypeCluster(instances, similarityMatrix, minSimilarity, minClusterSize = None, randomInitialization = False): | 114 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = None, randomInitialization = False): |
| 115 '''Finds exemplar (prototype) instance that represent each cluster | 115 '''Finds exemplar (prototype) instance that represent each cluster |
| 116 Returns the prototype indices (in the instances list) and the cluster label of each instance | 116 Returns the prototype indices (in the instances list) and the cluster label of each instance |
| 117 | 117 |
| 118 the elements in the instances list must have a length (method __len__), or one can use the random initialization | 118 the elements in the instances list must have a length (method __len__), or one can use the random initialization |
| 119 the positions in the instances list corresponds to the similarityMatrix | 119 the positions in the instances list corresponds to the similarities |
| 120 if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed) | |
| 121 similarities must still be allocated with the right size | |
| 120 | 122 |
| 121 if an instance is different enough (<minSimilarity), | 123 if an instance is different enough (<minSimilarity), |
| 122 it will become a new prototype. | 124 it will become a new prototype. |
| 123 Non-prototype instances will be assigned to an existing prototype | 125 Non-prototype instances will be assigned to an existing prototype |
| 124 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters | 126 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters |
| 125 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' | 127 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' |
| 126 | 128 |
| 127 # sort instances based on length | 129 # sort instances based on length |
| 128 indices = range(similarityMatrix.shape[0]) | 130 indices = range(len(instances)) |
| 129 if randomInitialization: | 131 if randomInitialization: |
| 130 indices = np.random.permutation(indices) | 132 indices = np.random.permutation(indices) |
| 131 else: | 133 else: |
| 132 def compare(i, j): | 134 def compare(i, j): |
| 133 if len(instances[i]) > len(instances[j]): | 135 if len(instances[i]) > len(instances[j]): |
| 138 return 1 | 140 return 1 |
| 139 indices.sort(compare) | 141 indices.sort(compare) |
| 140 # go through all instances | 142 # go through all instances |
| 141 prototypeIndices = [indices[0]] | 143 prototypeIndices = [indices[0]] |
| 142 for i in indices[1:]: | 144 for i in indices[1:]: |
| 143 if similarityMatrix[i][prototypeIndices].max() < minSimilarity: | 145 if similarityFunc is not None: |
| 146 for j in prototypeIndices: | |
| 147 if similarities[i][j] < 0: | |
| 148 similarities[i][j] = similarityFunc(instances[i], instances[j]) | |
| 149 similarities[j][i] = similarities[i][j] | |
| 150 if similarities[i][prototypeIndices].max() < minSimilarity: | |
| 144 prototypeIndices.append(i) | 151 prototypeIndices.append(i) |
| 145 | 152 |
| 146 # assignment | 153 # assignment |
| 147 indices = [i for i in range(similarityMatrix.shape[0]) if i not in prototypeIndices] | 154 indices = [i for i in range(similarities.shape[0]) if i not in prototypeIndices] |
| 148 assign = True | 155 assign = True |
| 149 while assign: | 156 while assign: |
| 150 labels = [-1]*similarityMatrix.shape[0] | 157 labels = [-1]*similarities.shape[0] |
| 151 for i in prototypeIndices: | 158 for i in prototypeIndices: |
| 152 labels[i] = i | 159 labels[i] = i |
| 153 for i in indices: | 160 for i in indices: |
| 154 prototypeIndex = similarityMatrix[i][prototypeIndices].argmax() | 161 if similarityFunc is not None: |
| 155 labels[i] = prototypeIndices[prototypeIndex] | 162 for j in prototypeIndices: |
| 163 if similarities[i][j] < 0: | |
| 164 similarities[i][j] = similarityFunc(instances[i], instances[j]) | |
| 165 similarities[j][i] = similarities[i][j] | |
| 166 prototypeIdx = similarities[i][prototypeIndices].argmax() | |
| 167 if similarities[i][prototypeIndices[prototypeIdx]] >= minSimilarity: | |
| 168 labels[i] = prototypeIndices[prototypeIdx] | |
| 169 else: | |
| 170 labels[i] = -1 # outlier | |
| 156 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} | 171 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} |
| 157 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get) | 172 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get) |
| 158 assign = (clusterSizes[smallestClusterIndex] < minClusterSize) | 173 assign = (clusterSizes[smallestClusterIndex] < minClusterSize) |
| 159 if assign: | 174 if assign: |
| 160 prototypeIndices.remove(smallestClusterIndex) | 175 prototypeIndices.remove(smallestClusterIndex) |
| 161 indices.append(smallestClusterIndex) | 176 indices.append(smallestClusterIndex) |
| 162 | 177 |
