1
2
3
4
5 """ Utilities for data manipulation
6
7 **FILE FORMATS:**
8
9 - *.qdat files* contain quantized data suitable for
10 feeding to learning algorithms.
11
12 The .qdat file, written by _DecTreeGui_, is structured as follows:
13
14 1) Any number of lines which are ignored.
15
16 2) A line containing the string 'Variable Table'
17
18 any number of variable definitions in the format:
19
20 '# Variable_name [quant_bounds]'
21
22 where '[quant_bounds]' is a list of the boundaries used for quantizing
23 that variable. If the variable is inherently integral (i.e. not
24 quantized), this can be an empty list.
25
26 3) A line beginning with '# ----' which signals the end of the variable list
27
28 4) Any number of lines containing data points, in the format:
29
30 'Name_of_point var1 var2 var3 .... varN'
31
32 all variable values should be integers
33
34 Throughout, it is assumed that varN is the result
35
36 - *.dat files* contain the same information as .qdat files, but the variable
37 values can be anything (floats, ints, strings). **These files should
38 still contain quant_bounds!**
39
- *.qdat.pkl files* contain a pickled (binary) representation of
  the data read in. They store, in order:
42
43 1) A python list of the variable names
44
45 2) A python list of lists with the quantization bounds
46
47 3) A python list of the point names
48
49 4) A python list of lists with the data points
50
51 """
52 from __future__ import print_function
53
54 import csv
55 import random
56 import re
57
58 import numpy
59
60 from rdkit.DataStructs import BitUtils
61 from rdkit.ML.Data import MLData
62 from rdkit.six import integer_types
63 from rdkit.six.moves import cPickle
64 from rdkit.utils import fileutils
65
66
71
72
def WriteData(outFile, varNames, qBounds, examples):
  """ writes out a .qdat file

  **Arguments**

    - outFile: a file object

    - varNames: a list of variable names

    - qBounds: the list of quantization bounds (should be the same length
      as _varNames_)

    - examples: the data to be written

  """
  # fixed header identifying the file and opening the variable table
  for headerLine in ('# Quantized data from DataUtils', '# ----------', '# Variable Table'):
    outFile.write(headerLine + '\n')
  # one line per variable: '# name [bounds]'
  for idx, name in enumerate(varNames):
    outFile.write('# %s %s\n' % (name, str(qBounds[idx])))
  outFile.write('# ----------\n')
  # one whitespace-delimited line per data point
  for example in examples:
    outFile.write(' '.join(str(fieldVal) for fieldVal in example) + '\n')
96
97
99 """ reads the variables and quantization bounds from a .qdat or .dat file
100
101 **Arguments**
102
103 - inFile: a file object
104
105 **Returns**
106
107 a 2-tuple containing:
108
109 1) varNames: a list of the variable names
110
111 2) qbounds: the list of quantization bounds for each variable
112
113 """
114 varNames = []
115 qBounds = []
116 fileutils.MoveToMatchingLine(inFile, 'Variable Table')
117 inLine = inFile.readline()
118 while inLine.find('# ----') == -1:
119 splitLine = inLine[2:].split('[')
120 varNames.append(splitLine[0].strip())
121 qBounds.append(splitLine[1][:-2])
122 inLine = inFile.readline()
123 for i in range(len(qBounds)):
124
125 if qBounds[i] != '':
126 l = qBounds[i].split(',')
127 qBounds[i] = []
128 for item in l:
129 qBounds[i].append(float(item))
130 else:
131 qBounds[i] = []
132 return varNames, qBounds
133
134
136 """ reads the examples from a .qdat file
137
138 **Arguments**
139
140 - inFile: a file object
141
142 **Returns**
143
144 a 2-tuple containing:
145
146 1) the names of the examples
147
148 2) a list of lists containing the examples themselves
149
150 **Note**
151
152 because this is reading a .qdat file, it assumed that all variable values
153 are integers
154
155 """
156 expr1 = re.compile(r'^#')
157 expr2 = re.compile(r'[\ ]+|[\t]+')
158 examples = []
159 names = []
160 inLine = inFile.readline()
161 while inLine:
162 if expr1.search(inLine) is None:
163 resArr = expr2.split(inLine)
164 if len(resArr) > 1:
165 examples.append([int(x) for x in resArr[1:]])
166 names.append(resArr[0])
167 inLine = inFile.readline()
168 return names, examples
169
170
172 """ reads the examples from a .dat file
173
174 **Arguments**
175
176 - inFile: a file object
177
178 **Returns**
179
180 a 2-tuple containing:
181
182 1) the names of the examples
183
184 2) a list of lists containing the examples themselves
185
186 **Note**
187
188 - this attempts to convert variable values to ints, then floats.
189 if those both fail, they are left as strings
190
191 """
192 expr1 = re.compile(r'^#')
193 expr2 = re.compile(r'[\ ]+|[\t]+')
194 examples = []
195 names = []
196 inLine = inFile.readline()
197 while inLine:
198 if expr1.search(inLine) is None:
199 resArr = expr2.split(inLine)[:-1]
200 if len(resArr) > 1:
201 for i in range(1, len(resArr)):
202 d = resArr[i]
203 try:
204 resArr[i] = int(d)
205 except ValueError:
206 try:
207 resArr[i] = float(d)
208 except ValueError:
209 pass
210 examples.append(resArr[1:])
211 names.append(resArr[0])
212 inLine = inFile.readline()
213 return names, examples
214
215
217 """ builds a data set from a .qdat file
218
219 **Arguments**
220
221 - fileName: the name of the .qdat file
222
223 **Returns**
224
225 an _MLData.MLQuantDataSet_
226
227 """
228 with open(fileName, 'r') as inFile:
229 varNames, qBounds = ReadVars(inFile)
230 ptNames, examples = ReadQuantExamples(inFile)
231 data = MLData.MLQuantDataSet(examples, qBounds=qBounds, varNames=varNames, ptNames=ptNames)
232 return data
233
234
236 """ builds a data set from a .dat file
237
238 **Arguments**
239
240 - fileName: the name of the .dat file
241
242 **Returns**
243
244 an _MLData.MLDataSet_
245
246 """
247 with open(fileName, 'r') as inFile:
248 varNames, qBounds = ReadVars(inFile)
249 ptNames, examples = ReadGeneralExamples(inFile)
250 data = MLData.MLDataSet(examples, qBounds=qBounds, varNames=varNames, ptNames=ptNames)
251 return data
252
253
255 """ calculates the number of possible values for each variable in a data set
256
257 **Arguments**
258
259 - data: a list of examples
260
261 - order: the ordering map between the variables in _data_ and _qBounds_
262
263 - qBounds: the quantization bounds for the variables
264
265 **Returns**
266
267 a list with the number of possible values each variable takes on in the data set
268
269 **Notes**
270
271 - variables present in _qBounds_ will have their _nPossible_ number read
272 from _qbounds
273
274 - _nPossible_ for other numeric variables will be calculated
275
276 """
277 numericTypes = integer_types + (float, numpy.int64, numpy.int32, numpy.int16)
278
279 if not silent:
280 print('order:', order, len(order))
281 print('qB:', qBounds)
282
283 assert (qBounds and len(order) == len(qBounds)) or (nQBounds and len(order) == len(nQBounds)), \
284 'order/qBounds mismatch'
285 nVars = len(order)
286 nPossible = [-1] * nVars
287 cols = list(range(nVars))
288 for i in range(nVars):
289 if nQBounds and nQBounds[i] != 0:
290 nPossible[i] = -1
291 cols.remove(i)
292 elif len(qBounds[i]) > 0:
293 nPossible[i] = len(qBounds[i])
294 cols.remove(i)
295
296 nPts = len(data)
297 for i in range(nPts):
298 for col in cols[:]:
299 d = data[i][order[col]]
300 if type(d) in numericTypes:
301 if int(d) == d:
302 nPossible[col] = max(int(d), nPossible[col])
303 else:
304 nPossible[col] = -1
305 cols.remove(col)
306 else:
307 if not silent:
308 print('bye bye col %d: %s' % (col, repr(d)))
309 nPossible[col] = -1
310 cols.remove(col)
311 return [int(x) + 1 for x in nPossible]
312
313
315 """ writes either a .qdat.pkl or a .dat.pkl file
316
317 **Arguments**
318
319 - outName: the name of the file to be used
320
321 - data: either an _MLData.MLDataSet_ or an _MLData.MLQuantDataSet_
322
323 """
324 varNames = data.GetVarNames()
325 qBounds = data.GetQuantBounds()
326 ptNames = data.GetPtNames()
327 examples = data.GetAllData()
328 with open(outName, 'wb+') as outFile:
329 cPickle.dump(varNames, outFile)
330 cPickle.dump(qBounds, outFile)
331 cPickle.dump(ptNames, outFile)
332 cPickle.dump(examples, outFile)
333
334
336 """
337
338 >>> v = [10,20,30,40,50]
339 >>> TakeEnsemble(v,(1,2,3))
340 [20, 30, 40]
341 >>> v = ['foo',10,20,30,40,50,1]
342 >>> TakeEnsemble(v,(1,2,3),isDataVect=True)
343 ['foo', 20, 30, 40, 1]
344
345 """
346 if isDataVect:
347 ensembleIds = [x + 1 for x in ensembleIds]
348 vect = [vect[0]] + [vect[x] for x in ensembleIds] + [vect[-1]]
349 else:
350 vect = [vect[x] for x in ensembleIds]
351 return vect
352
353
def DBToData(dbName, tableName, user='sysdba', password='masterkey', dupCol=-1, what='*', where='',
             join='', pickleCol=-1, pickleClass=None, ensembleIds=None):
  """ constructs an _MLData.MLDataSet_ from a database

  **Arguments**

    - dbName: the name of the database to be opened

    - tableName: the table name containing the data in the database

    - user: the user name to be used to connect to the database

    - password: the password to be used to connect to the database

    - dupCol: if nonzero specifies which column should be used to recognize
      duplicates.

    - what: SQL column selection passed through to the query

    - where, join: optional SQL clauses passed through to the query

    - pickleCol: if >= 0, the index (after the name column has been removed)
      of a column holding serialized data to be deserialized

    - pickleClass: optional class whose constructor is tried first to
      deserialize _pickleCol_; if it ever fails, cPickle is used for that row
      and all later rows

    - ensembleIds: optional sequence of variable ids to keep (applied to the
      pickle column's bit vector when _pickleCol_ >= 0, otherwise to the row)

  **Returns**

    an _MLData.MLDataSet_

  **Notes**

    - this uses Dbase.DataUtils functionality

  """
  from rdkit.Dbase.DbConnection import DbConnect
  conn = DbConnect(dbName, tableName, user, password)
  res = conn.GetData(fields=what, where=where, join=join, removeDups=dupCol, forceList=1)
  nPts = len(res)
  vals = [None] * nPts
  ptNames = [None] * nPts
  classWorks = True
  for i in range(nPts):
    tmp = list(res[i])
    # the first column is always the point name
    ptNames[i] = tmp.pop(0)
    if pickleCol >= 0:
      if not pickleClass or not classWorks:
        # SECURITY NOTE(review): unpickling database content executes
        # arbitrary code if the database is untrusted — confirm trust model
        tmp[pickleCol] = cPickle.loads(str(tmp[pickleCol]))
      else:
        try:
          tmp[pickleCol] = pickleClass(str(tmp[pickleCol]))
        except Exception:
          # the supplied class couldn't parse the blob: fall back to pickle
          # and stop trying the class for subsequent rows
          tmp[pickleCol] = cPickle.loads(str(tmp[pickleCol]))
          classWorks = False
      if ensembleIds:
        tmp[pickleCol] = BitUtils.ConstructEnsembleBV(tmp[pickleCol], ensembleIds)
    else:
      if ensembleIds:
        tmp = TakeEnsemble(tmp, ensembleIds, isDataVect=True)
    vals[i] = tmp
  varNames = conn.GetColumnNames(join=join, what=what)
  data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
  return data
408
409
def TextToData(reader, ignoreCols=(), onlyCols=None):
  """ constructs an _MLData.MLDataSet_ from a bunch of text

  **Arguments**

    - reader: needs to be iterable and return lists of elements
      (like a csv.reader); the first row is taken to be the variable names

    - ignoreCols: optional sequence of variable names to leave out
      (ignored when _onlyCols_ is provided)

    - onlyCols: optional sequence of the only variable names to keep,
      in the order they should appear

  **Returns**

    an _MLData.MLDataSet_

  **Notes**

    - variable values are converted to ints, then floats; values that are
      neither are kept as strings

  """
  # bug fix: the default for ignoreCols was a mutable list ([]); use an
  # immutable tuple so no caller can accidentally share state through it
  varNames = next(reader)
  if not onlyCols:
    keepCols = [i for i, name in enumerate(varNames) if name not in ignoreCols]
  else:
    keepCols = [-1] * len(onlyCols)
    for i, name in enumerate(varNames):
      if name in onlyCols:
        keepCols[onlyCols.index(name)] = i
    # NOTE(review): an onlyCols name missing from the header leaves a -1 here,
    # which silently selects the last column — confirm this is intended

  nCols = len(varNames)
  varNames = tuple([varNames[x] for x in keepCols])
  nVars = len(varNames)
  vals = []
  ptNames = []
  for splitLine in reader:
    if len(splitLine):
      if len(splitLine) != nCols:
        raise ValueError('unequal line lengths')
      tmp = [splitLine[x] for x in keepCols]
      # the first kept column is the point name, the rest are variables
      ptNames.append(tmp[0])
      pt = [None] * (nVars - 1)
      for j in range(nVars - 1):
        # try int first, then float, else keep the string
        try:
          val = int(tmp[j + 1])
        except ValueError:
          try:
            val = float(tmp[j + 1])
          except ValueError:
            val = str(tmp[j + 1])
        pt[j] = val
      vals.append(pt)
  data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
  return data
459
460
def TextFileToData(fName, onlyCols=None):
  """ builds an _MLData.MLDataSet_ from a delimited text file

  #DOC

  A '.csv' extension (case-insensitive) selects comma delimiting; anything
  else is read as tab-delimited.
  """
  isCsv = fName.split('.')[-1].upper() == 'CSV'
  with open(fName, 'r') as inF:
    if isCsv:
      splitter = csv.reader(inF)
    else:
      splitter = csv.reader(inF, delimiter='\t')
    return TextToData(splitter, onlyCols=onlyCols)
475
476
478 """ Seeds the random number generators
479
480 **Arguments**
481
482 - seed: a 2-tuple containing integers to be used as the random number seeds
483
484 **Notes**
485
486 this seeds both the RDRandom generator and the one in the standard
487 Python _random_ module
488
489 """
490 from rdkit import RDRandom
491 RDRandom.seed(seed[0])
492 random.seed(seed[0])
493
494
def FilterData(inData, val, frac, col=-1, indicesToUse=None, indicesOnly=0):
  """ Subsamples a dataset so that points with value _val_ in column _col_
  make up (at most) fraction _frac_ of what is kept.

  #DOC

  **Arguments**

    - inData: the data points (sequences indexable by _col_)

    - val: the target value to balance on

    - frac: the desired fraction (0-1) of target points in the result

    - col: the column holding the value to balance on (default: last)

    - indicesToUse: optional list of indices into _inData_ to restrict to

    - indicesOnly: if set, return indices instead of the points themselves

  **Returns**

    a 2-tuple (kept, rejected): either lists of points or, when
    _indicesOnly_ is set, lists of indices into the original data

  **Raises**

    - ValueError: if _frac_ is out of [0,1], _col_ is out of range, or
      _val_ never occurs in the data

  """
  if frac < 0 or frac > 1:
    raise ValueError('filter fraction out of bounds')
  try:
    inData[0][col]
  except IndexError:
    raise ValueError('target column index out of range')

  # optionally restrict to a subset, then sort by the target column so the
  # points carrying `val` form one contiguous run
  if indicesToUse:
    tmp = [inData[x] for x in indicesToUse]
  else:
    tmp = list(inData)
  nOrig = len(tmp)
  sortOrder = list(range(nOrig))
  sortOrder.sort(key=lambda x: tmp[x][col])
  tmp = [tmp[x] for x in sortOrder]

  # locate the start of the run of target-valued points
  start = 0
  while start < nOrig and tmp[start][col] != val:
    start += 1
  if start >= nOrig:
    raise ValueError('target value (%d) not found in data' % (val))

  # ... and its end
  finish = start + 1
  while finish < nOrig and tmp[finish][col] == val:
    finish += 1

  # number of points carrying the target value
  nWithVal = finish - start
  # and the number of points that don't
  nOthers = len(tmp) - nWithVal

  currFrac = float(nWithVal) / nOrig
  if currFrac < frac:
    # target points are under-represented: keep all of them and shrink the
    # pool of "others" until the target fraction is reached
    nTgtFinal = nWithVal
    nFinal = int(round(nWithVal / frac))
    nOthersFinal = nFinal - nTgtFinal
    # rounding can overshoot; walk back until the fraction constraint holds
    while float(nTgtFinal) / nFinal > frac:
      nTgtFinal -= 1
      nFinal -= 1
  else:
    # target points are over-represented: keep all of the "others" and
    # shrink the pool of target points instead
    nOthersFinal = nOthers
    nFinal = int(round(nOthers / (1 - frac)))
    nTgtFinal = nFinal - nOthersFinal
    # rounding can undershoot; walk back until the fraction constraint holds
    while float(nTgtFinal) / nFinal < frac:
      nOthersFinal -= 1
      nFinal -= 1

  # randomly choose which "other" points to keep
  others = list(range(start)) + list(range(finish, nOrig))
  othersTake = permutation(nOthers)
  others = [others[x] for x in othersTake[:nOthersFinal]]

  # randomly choose which target points to keep
  targets = list(range(start, finish))
  targetsTake = permutation(nWithVal)
  targets = [targets[x] for x in targetsTake[:nTgtFinal]]

  # indices (into the sorted copy) of everything we keep
  indicesToKeep = targets + others

  res = []
  rej = []
  # emit in a random order so the result isn't sorted by the target column
  if not indicesOnly:
    for i in permutation(nOrig):
      if i in indicesToKeep:
        res.append(tmp[i])
      else:
        rej.append(tmp[i])
  else:
    # map positions in the sorted copy back to original indices
    for i in permutation(nOrig):
      if not indicesToUse:
        idx = sortOrder[i]
      else:
        idx = indicesToUse[sortOrder[i]]
      if i in indicesToKeep:
        res.append(idx)
      else:
        rej.append(idx)
  return res, rej
605
606
608 """ #DOC
609 """
610 counts = {}
611 for p in inData:
612 if not bounds:
613 r = p[col]
614 else:
615 act = p[col]
616 bound = 0
617 placed = 0
618 while not placed and bound < len(bounds):
619 if act < bounds[bound]:
620 r = bound
621 placed = 1
622 else:
623 bound += 1
624 if not placed:
625 r = bound
626
627 counts[r] = counts.get(r, 0) + 1
628 return counts
629
630
632 """ randomizes the activity values of a dataset
633
634 **Arguments**
635
636 - dataSet: a _ML.Data.MLQuantDataSet_, the activities here will be randomized
637
638 - shuffle: an optional toggle. If this is set, the activity values
639 will be shuffled (so the number in each class remains constant)
640
641 - runDetails: an optional CompositeRun object
642
643 **Note**
644
645 - _examples_ are randomized in place
646
647
648 """
649 nPts = dataSet.GetNPts()
650 if shuffle:
651 if runDetails:
652 runDetails.shuffled = 1
653 acts = dataSet.GetResults()[:]
654
655
656 random.shuffle(acts, random=random.random)
657 else:
658 if runDetails:
659 runDetails.randomized = 1
660 nPossible = dataSet.GetNPossibleVals()[-1]
661 acts = [random.randint(0, nPossible) for _ in len(examples)]
662 for i in range(nPts):
663 tmp = dataSet[i]
664 tmp[-1] = acts[i]
665 dataSet[i] = tmp
666
667
668
669
670
671
def _runDoctests(verbose=None):  # pragma: nocover
  """Run this module's doctests and exit with the number of failures."""
  import sys
  import doctest
  failed, _ = doctest.testmod(optionflags=doctest.ELLIPSIS, verbose=verbose)
  sys.exit(failed)


if __name__ == '__main__':  # pragma: nocover
  _runDoctests()
681