1
2
3
4
5
6 """ Automatic search for quantization bounds
7
8 This uses the expected informational gain to determine where quantization bounds should
9 lie.
10
11 **Notes**:
12
13 - bounds are less than, so if the bounds are [1.,2.],
14 [0.9,1.,1.1,2.,2.2] -> [0,1,1,2,2]
15
16 """
17 from __future__ import print_function
18 import numpy
19 from rdkit.ML.InfoTheory import entropy
20 from rdkit.six.moves import zip, map, range
21 try:
22 from rdkit.ML.Data import cQuantize
23 except ImportError:
24 hascQuantize = 0
25 else:
26 hascQuantize = 1
27
28 _float_tol = 1e-8
29
30
32 """ floating point equality with a tolerance factor
33
34 **Arguments**
35
36 - v1: a float
37
38 - v2: a float
39
40 - tol: the tolerance for comparison
41
42 **Returns**
43
44 0 or 1
45
46 """
47 return abs(v1 - v2) < tol
48
49
51 """ Uses FindVarMultQuantBounds, only here for historic reasons
52 """
53 bounds, gain = FindVarMultQuantBounds(vals, 1, results, nPossibleRes)
54 return (bounds[0], gain)
55
56
58 """ Primarily intended for internal use
59
60 constructs a variable table for the data passed in
61 The table for a given variable records the number of times each possible value
62 of that variable appears for each possible result of the function.
63
64 **Arguments**
65
66 - vals: a 1D Numeric array with the values of the variables
67
68 - cuts: a list with the indices of the quantization bounds
69 (indices are into _starts_ )
70
71 - starts: a list of potential starting points for quantization bounds
72
73 - results: a 1D Numeric array of integer result codes
74
75 - nPossibleRes: an integer with the number of possible result codes
76
77 **Returns**
78
79 the varTable, a 2D Numeric array which is nVarValues x nPossibleRes
80
81 **Notes**
82
83 - _vals_ should be sorted!
84
85 """
86 nVals = len(cuts) + 1
87 varTable = numpy.zeros((nVals, nPossibleRes), 'i')
88 idx = 0
89 for i in range(nVals - 1):
90 cut = cuts[i]
91 while idx < starts[cut]:
92 varTable[i, results[idx]] += 1
93 idx += 1
94 while idx < len(vals):
95 varTable[-1, results[idx]] += 1
96 idx += 1
97 return varTable
98
99
101 """ Primarily intended for internal use
102
103 Recursively finds the best quantization boundaries
104
105 **Arguments**
106
107 - vals: a 1D Numeric array with the values of the variables,
108 this should be sorted
109
110 - cuts: a list with the indices of the quantization bounds
111 (indices are into _starts_ )
112
113 - which: an integer indicating which bound is being adjusted here
114 (and index into _cuts_ )
115
116 - starts: a list of potential starting points for quantization bounds
117
118 - results: a 1D Numeric array of integer result codes
119
120 - nPossibleRes: an integer with the number of possible result codes
121
122 **Returns**
123
124 - a 2-tuple containing:
125
126 1) the best information gain found so far
127
128 2) a list of the quantization bound indices ( _cuts_ for the best case)
129
130 **Notes**
131
132 - this is not even remotely efficient, which is why a C replacement
133 was written
134
135 """
136 nBounds = len(cuts)
137 maxGain = -1e6
138 bestCuts = None
139 highestCutHere = len(starts) - nBounds + which
140 if varTable is None:
141 varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes)
142 while cuts[which] <= highestCutHere:
143 varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes)
144 gainHere = entropy.InfoGain(varTable)
145 if gainHere > maxGain:
146 maxGain = gainHere
147 bestCuts = cuts[:]
148
149 if which < nBounds - 1:
150 gainHere, cutsHere = _RecurseOnBounds(vals, cuts[:], which + 1, starts, results, nPossibleRes,
151 varTable=varTable)
152 if gainHere > maxGain:
153 maxGain = gainHere
154 bestCuts = cutsHere
155
156 cuts[which] += 1
157 for i in range(which + 1, nBounds):
158 if cuts[i] == cuts[i - 1]:
159 cuts[i] += 1
160
161 return maxGain, bestCuts
162
163
165 """ Primarily intended for internal use
166
167 Recursively finds the best quantization boundaries
168
169 **Arguments**
170
171 - vals: a 1D Numeric array with the values of the variables,
172 this should be sorted
173
174 - cuts: a list with the indices of the quantization bounds
175 (indices are into _starts_ )
176
177 - which: an integer indicating which bound is being adjusted here
178 (and index into _cuts_ )
179
180 - starts: a list of potential starting points for quantization bounds
181
182 - results: a 1D Numeric array of integer result codes
183
184 - nPossibleRes: an integer with the number of possible result codes
185
186 **Returns**
187
188 - a 2-tuple containing:
189
190 1) the best information gain found so far
191
192 2) a list of the quantization bound indices ( _cuts_ for the best case)
193
194 **Notes**
195
196 - this is not even remotely efficient, which is why a C replacement
197 was written
198
199 """
200 nBounds = len(cuts)
201 maxGain = -1e6
202 bestCuts = None
203 highestCutHere = len(starts) - nBounds + which
204 if varTable is None:
205 varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes)
206 while cuts[which] <= highestCutHere:
207 gainHere = entropy.InfoGain(varTable)
208 if gainHere > maxGain:
209 maxGain = gainHere
210 bestCuts = cuts[:]
211
212 if which < nBounds - 1:
213 gainHere, cutsHere = _RecurseOnBounds(vals, cuts[:], which + 1, starts, results, nPossibleRes,
214 varTable=None)
215 if gainHere > maxGain:
216 maxGain = gainHere
217 bestCuts = cutsHere
218
219 oldCut = cuts[which]
220 cuts[which] += 1
221 bot = starts[oldCut]
222 if oldCut + 1 < len(starts):
223 top = starts[oldCut + 1]
224 else:
225 top = starts[-1]
226 for i in range(bot, top):
227 v = results[i]
228 varTable[which, v] += 1
229 varTable[which + 1, v] -= 1
230 for i in range(which + 1, nBounds):
231 if cuts[i] == cuts[i - 1]:
232 cuts[i] += 1
233
234 return maxGain, bestCuts
235
236
238
239
240
241
242
243
244
245
246
247
248
249
250 startNext = []
251 tol = 1e-8
252 blockAct = sortResults[0]
253 lastBlockAct = None
254 lastDiv = None
255 i = 1
256 while i < nData:
257
258 while i < nData and sortVals[i] - sortVals[i - 1] <= tol:
259 if sortResults[i] != blockAct:
260
261 blockAct = -1
262 i += 1
263 if lastBlockAct is None:
264
265 lastBlockAct = blockAct
266 lastDiv = i
267 else:
268 if blockAct == -1 or lastBlockAct == -1 or blockAct != lastBlockAct:
269 startNext.append(lastDiv)
270 lastDiv = i
271 lastBlockAct = blockAct
272 else:
273 lastDiv = i
274 if i < nData:
275 blockAct = sortResults[i]
276 i += 1
277
278 if blockAct != lastBlockAct:
279 startNext.append(lastDiv)
280 return startNext
281
282
284 """ finds multiple quantization bounds for a single variable
285
286 **Arguments**
287
288 - vals: sequence of variable values (assumed to be floats)
289
290 - nBounds: the number of quantization bounds to find
291
292 - results: a list of result codes (should be integers)
293
294 - nPossibleRes: an integer with the number of possible values of the
295 result variable
296
297 **Returns**
298
299 - a 2-tuple containing:
300
301 1) a list of the quantization bounds (floats)
302
303 2) the information gain associated with this quantization
304
305
306 """
307 assert len(vals) == len(results), 'vals/results length mismatch'
308
309 nData = len(vals)
310 if nData == 0:
311 return [], -1e8
312
313
314 svs = list(zip(vals, results))
315 svs.sort()
316 sortVals, sortResults = zip(*svs)
317 startNext = _FindStartPoints(sortVals, sortResults, nData)
318 if not len(startNext):
319 return [0], 0.0
320 if len(startNext) < nBounds:
321 nBounds = len(startNext) - 1
322 if nBounds == 0:
323 nBounds = 1
324 initCuts = list(range(nBounds))
325 maxGain, bestCuts = _RecurseOnBounds(sortVals, initCuts, 0, startNext, sortResults, nPossibleRes)
326 quantBounds = []
327 nVs = len(sortVals)
328 for cut in bestCuts:
329 idx = startNext[cut]
330 if idx == nVs:
331 quantBounds.append(sortVals[-1])
332 elif idx == 0:
333 quantBounds.append(sortVals[idx])
334 else:
335 quantBounds.append((sortVals[idx] + sortVals[idx - 1]) / 2.)
336
337 return quantBounds, maxGain
338
339
340 if hascQuantize:
341 _RecurseOnBounds = cQuantize._RecurseOnBounds
342 _FindStartPoints = cQuantize._FindStartPoints
343 else:
344 _RecurseOnBounds = _NewPyRecurseOnBounds
345 _FindStartPoints = _NewPyFindStartPoints
346
347 if __name__ == '__main__':
348 if 1:
349 d = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 1), (1.4, 0), (1.6, 1), (2., 1), (2.1, 0), (2.1, 0),
350 (2.1, 0), (2.2, 1), (2.3, 0)]
351 varValues = list(map(lambda x: x[0], d))
352 resCodes = list(map(lambda x: x[1], d))
353 nPossibleRes = 2
354 res = FindVarMultQuantBounds(varValues, 2, resCodes, nPossibleRes)
355 print('RES:', res)
356 target = ([1.3, 2.05], .34707)
357 else:
358 d = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 1), (1.4, 0), (1.6, 1), (2., 1), (2.1, 0), (2.1, 0),
359 (2.1, 0), (2.2, 1), (2.3, 0)]
360 varValues = list(map(lambda x: x[0], d))
361 resCodes = list(map(lambda x: x[1], d))
362 nPossibleRes = 2
363 res = FindVarMultQuantBounds(varValues, 1, resCodes, nPossibleRes)
364 print(res)
365
366 d = [(1.4, 1), (1.4, 0)]
367
368 varValues = list(map(lambda x: x[0], d))
369 resCodes = list(map(lambda x: x[1], d))
370 nPossibleRes = 2
371 res = FindVarMultQuantBounds(varValues, 1, resCodes, nPossibleRes)
372 print(res)
373
374 d = [(1.4, 0), (1.4, 0), (1.6, 1)]
375 varValues = list(map(lambda x: x[0], d))
376 resCodes = list(map(lambda x: x[1], d))
377 nPossibleRes = 2
378 res = FindVarMultQuantBounds(varValues, 2, resCodes, nPossibleRes)
379 print(res)
380