1
2
3
4
5
6
7
8
9
10
11 """utility functions for clustering
12
13 """
14
15
17 """returns an ordered list of all nodes below cluster
18
19 the ordering is done using the lengths of the child nodes
20
21 **Arguments**
22
23 - cluster: the cluster in question
24
25 **Returns**
26
27 - a list of the leaves below this cluster
28
29 """
30 if len(cluster) == 1:
31 return [cluster]
32 else:
33 children = cluster.GetChildren()
34 children.sort(key=lambda x: len(x), reverse=True)
35 res = []
36 for child in children:
37 res += GetNodeList(child)
38 res += [cluster]
39 return res
40
41
43 """returns an ordered list of all nodes below cluster
44
45
46 """
47 if hasattr(cluster, '_isCentroid'):
48 cluster._aboveCentroid = 0
49 above = -1
50 else:
51 cluster._aboveCentroid = above
52 if len(cluster) == 1:
53 return [cluster]
54 else:
55 res = []
56 children = cluster.GetChildren()
57 children.sort(key=lambda x: len(x), reverse=True)
58
59 for child in children:
60 res = res + GetNodesDownToCentroids(child, above)
61 res = res + [cluster]
62 return res
63
64
66 """ find the point in a cluster which has the smallest summed
67 Euclidean distance to all others
68
69 **Arguments**
70
71 - cluster: the cluster to work with
72
73 - dists: the distance matrix to use for the points
74
75 **Returns**
76
77 - the index of the centroid point
78
79 """
80 children = cluster.GetPoints()
81 pts = [x.GetData() for x in children]
82
83 best = 1e24
84 bestIdx = -1
85 for pt in pts:
86 dAccum = 0.0
87
88 for other in pts:
89 if other != pt:
90 if other > pt:
91 row, col = pt, other
92 else:
93 row, col = other, pt
94 dAccum += dists[col * (col - 1) / 2 + row]
95 if dAccum >= best:
96
97 break
98 if dAccum < best:
99 best = dAccum
100 bestIdx = pt
101 for i in range(len(pts)):
102 pt = pts[i]
103 if pt != bestIdx:
104 if pt > bestIdx:
105 row, col = bestIdx, pt
106 else:
107 row, col = pt, bestIdx
108 children[i]._distToCenter = dists[col * (col - 1) / 2 + row]
109 else:
110 children[i]._distToCenter = 0.0
111 children[i]._clustCenter = bestIdx
112 cluster._clustCenter = bestIdx
113 cluster._distToCenter = 0.0
114
115 return bestIdx
116
117
119 """ *Internal Use Only*
120
121 """
122 if len(cluster) < n:
123 raise ValueError('Cannot split cluster of length %d into %d pieces' % (len(cluster), n))
124 if len(cluster) == n:
125 return cluster.GetPoints()
126 clusters = [cluster]
127 nxtIdx = 0
128 for _ in range(n - 1):
129 while nxtIdx < len(clusters) and len(clusters[nxtIdx]) == 1:
130 nxtIdx += 1
131 assert nxtIdx < len(clusters)
132
133 children = clusters[nxtIdx].GetChildren()
134 children.sort(key=lambda x: x.GetMetric(), reverse=True)
135 for child in children:
136 clusters.append(child)
137 del clusters[nxtIdx]
138 return clusters
139
140
142 """ *Internal Use Only*
143
144 """
145 if len(cluster) < n:
146 raise ValueError('Cannot split cluster of length %d into %d pieces' % (len(cluster), n))
147 if len(cluster) == n:
148 return cluster.GetPoints()
149 clusters = [cluster]
150 for _ in range(n - 1):
151 nxtIdx = 0
152 while nxtIdx < len(clusters) and len(clusters[nxtIdx]) == 1:
153 nxtIdx += 1
154 assert nxtIdx < len(clusters)
155
156 children = clusters[nxtIdx].GetChildren()
157 for child in children:
158 clusters.append(child)
159 del clusters[nxtIdx]
160 clusters.sort(key=lambda x: x.GetMetric(), reverse=True)
161 return clusters
162
163
165 """ splits a cluster tree into a set of branches
166
167 **Arguments**
168
169 - cluster: the root of the cluster tree
170
171 - n: the number of clusters to include in the split
172
173 - breadthFirst: toggles breadth first (vs depth first) cleavage
174 of the cluster tree.
175
176 **Returns**
177
178 - a list of sub clusters
179
180 """
181 if breadthFirst:
182 return _BreadthFirstSplit(cluster, n)
183 else:
184 return _HeightFirstSplit(cluster, n)
185