diff python/ml.py @ 746:e7ff0f60fef8

merged new developments (indicator and trajectory clustering)
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Thu, 10 Sep 2015 15:52:45 -0400
parents 2472b4d59aea
children 1f2b2d1f4fbf
line wrap: on
line diff
--- a/python/ml.py	Mon Aug 10 01:06:59 2015 -0400
+++ b/python/ml.py	Thu Sep 10 15:52:45 2015 -0400
@@ -111,13 +111,73 @@
 	code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster)
 	return code,sigma	
 
-def motionPatterLearning(objects, maxDistance):
-    ''' 
-    Option to use only the (n?) longest features per object instead of all for speed up
-    TODO'''
-    pass
+def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = None, randomInitialization = False):
+    '''Finds exemplar (prototype) instance that represent each cluster
+    Returns the prototype indices (in the instances list) and the cluster label of each instance
+
+    the elements in the instances list must have a length (method __len__), or one can use the random initialization
+    the positions in the instances list corresponds to the similarities
+    if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed)
+    similarities must still be allocated with the right size
+
+    if an instance is different enough (<minSimilarity), 
+    it will become a new prototype. 
+    Non-prototype instances will be assigned to an existing prototype
+    if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters
+    and reassigning all elements in the cluster until no cluster is smaller than minClusterSize'''
 
-def prototypeCluster():
-    ''' 
-    TODO'''
-    pass
+    # sort instances based on length
+    indices = range(len(instances))
+    if randomInitialization:
+        indices = np.random.permutation(indices)
+    else:
+        def compare(i, j):
+            if len(instances[i]) > len(instances[j]):
+                return -1
+            elif len(instances[i]) == len(instances[j]):
+                return 0
+            else:
+                return 1
+        indices.sort(compare)
+    # go through all instances
+    prototypeIndices = [indices[0]]
+    for i in indices[1:]:
+        if similarityFunc is not None:
+            for j in prototypeIndices:
+                if similarities[i][j] < 0:
+                    similarities[i][j] = similarityFunc(instances[i], instances[j])
+                    similarities[j][i] = similarities[i][j]
+        if similarities[i][prototypeIndices].max() < minSimilarity:
+             prototypeIndices.append(i)
+
+    # assignment
+    indices = [i for i in range(similarities.shape[0]) if i not in prototypeIndices]
+    assign = True
+    while assign:
+        labels = [-1]*similarities.shape[0]
+        for i in prototypeIndices:
+            labels[i] = i
+        for i in indices:
+            if similarityFunc is not None:
+                for j in prototypeIndices:
+                    if similarities[i][j] < 0:
+                        similarities[i][j] = similarityFunc(instances[i], instances[j])
+                        similarities[j][i] = similarities[i][j]
+            prototypeIdx = similarities[i][prototypeIndices].argmax()
+            if similarities[i][prototypeIndices[prototypeIdx]] >= minSimilarity:
+                labels[i] = prototypeIndices[prototypeIdx]
+            else:
+                labels[i] = -1 # outlier
+        clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices}
+        smallestClusterIndex = min(clusterSizes, key = clusterSizes.get)
+        assign = (clusterSizes[smallestClusterIndex] < minClusterSize)
+        if assign:
+            prototypeIndices.remove(smallestClusterIndex)
+            indices.append(smallestClusterIndex)
+
+    return prototypeIndices, labels
+
+def computeClusterSizes(labels, prototypeIndices, outlierIndex = -1):
+    clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices}
+    clusterSizes['outlier'] = sum(np.array(labels) == -1)
+    return clusterSizes