1
2
3
4
5 """ classes to be used to help work with data sets
6
7 """
8 from __future__ import print_function
9
10 import copy
11 import math
12
13 import numpy
14
15 from rdkit.six import integer_types
16
17
18 numericTypes = integer_types + (float, )
19
20
22 """ A data set for holding general data (floats, ints, and strings)
23
24 **Note**
25 this is intended to be a read-only data structure
26 (i.e. after calling the constructor you cannot touch it)
27 """
28
29 - def __init__(self, data, nVars=None, nPts=None, nPossibleVals=None, qBounds=None, varNames=None,
30 ptNames=None, nResults=1):
31 """ Constructor
32
33 **Arguments**
34
35 - data: a list of lists containing the data. The data are copied, so don't worry
36 about us overwriting them.
37
38 - nVars: the number of variables
39
40 - nPts: the number of points
41
42 - nPossibleVals: an list containing the number of possible values
43 for each variable (should contain 0 when not relevant)
44 This is _nVars_ long
45
46 - qBounds: a list of lists containing quantization bounds for variables
47 which are to be quantized (note, this class does not quantize
48 the variables itself, it merely stores quantization bounds.
49 an empty sublist indicates no quantization for a given variable
50 This is _nVars_ long
51
52 - varNames: a list of the names of the variables.
53 This is _nVars_ long
54
55 - ptNames: the names (labels) of the individual data points
56 This is _nPts_ long
57
58 - nResults: the number of results columns in the data lists. This is usually
59 1, but can be higher.
60 """
61 self.data = [x[:] for x in data]
62 self.nResults = nResults
63 if nVars is None:
64 nVars = len(self.data[0]) - self.nResults
65 self.nVars = nVars
66 if nPts is None:
67 nPts = len(data)
68 self.nPts = nPts
69 if qBounds is None:
70 qBounds = [[]] * len(self.data[0])
71 self.qBounds = qBounds
72 if nPossibleVals is None:
73 nPossibleVals = self._CalcNPossible(self.data)
74 self.nPossibleVals = nPossibleVals
75 if varNames is None:
76 varNames = [''] * self.nVars
77 self.varNames = varNames
78 if ptNames is None:
79 ptNames = [''] * self.nPts
80 self.ptNames = ptNames
81
def _CalcNPossible(self, data):
    """calculates the number of possible values of each variable (where possible)

    **Arguments**

      -data: a list of examples.  NOTE: the examples that are actually scanned
       are the ones stored in _self.data_; the constructor passes that same
       list here.

    **Returns**

      a list of nPossible values for each column (variables followed by results)

    **Notes**

      - a column with quantization bounds gets len(bounds) + 1

      - a column holding only integral numbers gets (largest value) + 1

      - any other column (strings, non-integral or non-finite floats) gets 0

    """
    nCols = self.GetNVars() + self.nResults
    nPossible = [-1] * nCols
    # columns that still need to be examined in the data
    cols = list(range(nCols))
    for i, bounds in enumerate(self.qBounds):
        if len(bounds) > 0:
            nPossible[i] = len(bounds)
            cols.remove(i)

    for pt in self.data:
        # iterate over a copy, columns are dropped from cols as they are ruled out
        for col in cols[:]:
            d = pt[col]
            # float.is_integer() is False for NaN and +/-inf, where
            # math.floor() would raise instead of answering
            if type(d) in numericTypes and (type(d) is not float or d.is_integer()):
                nPossible[col] = max(int(d), nPossible[col])
            else:
                nPossible[col] = -1
                cols.remove(col)
    return [int(x) + 1 for x in nPossible]
115
118
121
124
def GetNPossibleVals(self):
    """ returns the stored list of possible-value counts

    **Note**

      the list itself is handed back, not a copy

    """
    return self.nPossibleVals
127
130
132 res = [self.ptNames[idx]] + self.data[idx][:]
133 return res
134
136 if len(val) != self.GetNVars() + self.GetNResults() + 1:
137 raise ValueError('bad value in assignment')
138 self.ptNames[idx] = val[0]
139 self.data[idx] = val[1:]
140 return val
141
def GetNamedData(self):
    """ returns the examples with their names attached

    **Returns**

      a list with one entry per point; each entry is a new list holding
      the point's name followed by the values of that example

    """
    return [[self.ptNames[idx]] + self.data[idx][:] for idx in range(self.nPts)]
155
def GetAllData(self):
    """ returns a deep *copy* of the stored examples

    changing the returned lists leaves the data set untouched

    """
    duplicate = copy.deepcopy(self.data)
    return duplicate
161
173
def GetResults(self):
    """ Returns the result fields from each example

    **Returns**

      with more than one result column: a list of slices, each holding the
      last _nResults_ entries of an example

      otherwise: a list with the last entry of each example

    """
    nRes = self.GetNResults()
    if nRes > 1:
        return [row[-nRes:] for row in self.data]
    return [row[-1] for row in self.data]
184
187
190
192 self.data.append(pt[1:])
193 self.ptNames.append(pt[0])
194 self.nPts += 1
195
197 if len(pts) != len(names):
198 raise ValueError("input length mismatch")
199 self.data += pts
200 self.ptNames += names
201 self.nPts = len(self.data)
202
203
205 """ a data set for holding quantized data
206
207
208 **Note**
209
210 this is intended to be a read-only data structure
211 (i.e. after calling the constructor you cannot touch it)
212
213 **Big differences to MLDataSet**
214
215 1) data are stored in a numpy array since they are homogenous
216
217 2) results are assumed to be quantized (i.e. no qBounds entry is required)
218
219 """
220
222 """calculates the number of possible values of each variable
223
224 **Arguments**
225
226 -data: a list of examples to be used
227
228 **Returns**
229
230 a list of nPossible values for each variable
231
232 """
233 return [max(x) + 1 for x in numpy.transpose(data)]
234
def GetNamedData(self):
    """ returns the examples with their names attached

    **Returns**

      a list with one entry per point; each entry is a plain list holding
      the point's name followed by the values of that example (converted
      from the stored array row with tolist())

    """
    return [[self.ptNames[idx]] + self.data[idx].tolist() for idx in range(self.nPts)]
248
def GetAllData(self):
    """ returns a *copy* of the data as nested python lists

    """
    asLists = self.data.tolist()
    return asLists
254
265
def GetResults(self):
    """ Returns the result fields from each example

    **Returns**

      with more than one result column: a list holding, for each row of the
      stored data, the slice of its last _nResults_ entries

      otherwise: a list with the last entry of each row

    """
    nRes = self.GetNResults()
    if nRes > 1:
        return [row[-nRes:] for row in self.data]
    return [row[-1] for row in self.data]
276
277 - def __init__(self, data, nVars=None, nPts=None, nPossibleVals=None, qBounds=None, varNames=None,
278 ptNames=None, nResults=1):
279 """ Constructor
280
281 **Arguments**
282
283 - data: a list of lists containing the data. The data are copied, so don't worry
284 about us overwriting them.
285
286 - nVars: the number of variables
287
288 - nPts: the number of points
289
290 - nPossibleVals: an list containing the number of possible values
291 for each variable (should contain 0 when not relevant)
292 This is _nVars_ long
293
294 - qBounds: a list of lists containing quantization bounds for variables
295 which are to be quantized (note, this class does not quantize
296 the variables itself, it merely stores quantization bounds.
297 an empty sublist indicates no quantization for a given variable
298 This is _nVars_ long
299
300 - varNames: a list of the names of the variables.
301 This is _nVars_ long
302
303 - ptNames: the names (labels) of the individual data points
304 This is _nPts_ long
305
306 - nResults: the number of results columns in the data lists. This is usually
307 1, but can be higher.
308 """
309 self.data = numpy.array(data)
310 self.nResults = nResults
311 if nVars is None:
312 nVars = len(data[0]) - self.nResults
313 self.nVars = nVars
314 if nPts is None:
315 nPts = len(data)
316 self.nPts = nPts
317 if qBounds is None:
318 qBounds = [[]] * self.nVars
319 self.qBounds = qBounds
320 if nPossibleVals is None:
321 nPossibleVals = self._CalcNPossible(data)
322 self.nPossibleVals = nPossibleVals
323 if varNames is None:
324 varNames = [''] * self.nVars
325 self.varNames = varNames
326 if ptNames is None:
327 ptNames = [''] * self.nPts
328 self.ptNames = ptNames
329
330
if __name__ == '__main__':
    # Demo: build one quantized and one general data set, hand each to
    # DataUtils.WritePickledData (presumably pickles it to the given path)
    # and print what the accessors report.
    from . import DataUtils

    def _report(ds):
        """ prints the accessor results for a data set to stdout """
        print('nVars:', ds.GetNVars())
        print('nPts:', ds.GetNPts())
        print('nPoss:', ds.GetNPossibleVals())
        print('qBounds:', ds.GetQuantBounds())
        print('data:', ds.GetAllData())
        print('Input data:', ds.GetInputData())
        print('results:', ds.GetResults())

        print('nameddata:', ds.GetNamedData())

    # --- quantized data: all-integer examples, last column is the result
    examples = [[0, 0, 0, 0, 0], [0, 0, 0, 1, 0], [1, 0, 0, 0, 1], [2, 1, 0, 0, 1], [2, 2, 1, 0, 1]]
    varNames = ['foo1', 'foo2', 'foo3', 'foo4', 'res']
    ptNames = ['p1', 'p2', 'p3', 'p4', 'p5']
    dataset = MLQuantDataSet(examples, varNames=varNames, ptNames=ptNames)
    DataUtils.WritePickledData('test_data/test.qdat.pkl', dataset)
    _report(dataset)

    # --- general data: mixed strings/ints/floats, bounds only on the last column
    examples = [
        ['foo', 1, 1.0, 1, 1.1],
        ['foo', 2, 1.0, 1, 2.1],
        ['foo', 3, 1.2, 1.1, 3.1],
        ['foo', 4, 1.0, 1, 4.1],
        ['foo', 5, 1.1, 1, 5.1],
    ]
    qBounds = [[], [], [], [], [2, 4]]
    # these two are assigned but not handed to the constructor below
    varNames = ['foo1', 'foo2', 'foo3', 'foo4', 'res']
    ptNames = ['p1', 'p2', 'p3', 'p4', 'p5']
    dataset = MLDataSet(examples, qBounds=qBounds)
    DataUtils.WritePickledData('test_data/test.dat.pkl', dataset)
    _report(dataset)
369