Package rdkit :: Package ML :: Package Data :: Module DataUtils
[hide private]
[frames | no frames]

Source Code for Module rdkit.ML.Data.DataUtils

  1  # 
  2  #  Copyright (C) 2000-2008  greg Landrum and Rational Discovery LLC 
  3  #   All Rights Reserved 
  4  # 
  5  """ Utilities for data manipulation 
  6   
  7  **FILE FORMATS:** 
  8   
  9   - *.qdat files* contain quantized data suitable for 
 10    feeding to learning algorithms. 
 11   
 12    The .qdat file, written by _DecTreeGui_, is structured as follows: 
 13   
 14     1) Any number of lines which are ignored. 
 15   
 16     2) A line containing the string 'Variable Table' 
 17   
 18        any number of variable definitions in the format: 
 19   
 20        '# Variable_name [quant_bounds]' 
 21   
 22          where '[quant_bounds]' is a list of the boundaries used for quantizing 
 23           that variable.  If the variable is inherently integral (i.e. not 
 24           quantized), this can be an empty list. 
 25   
 26     3) A line beginning with '# ----' which signals the end of the variable list 
 27   
 28     4) Any number of lines containing data points, in the format: 
 29   
 30        'Name_of_point var1 var2 var3 .... varN' 
 31   
 32        all variable values should be integers 
 33   
 34     Throughout, it is assumed that varN is the result 
 35   
 36   - *.dat files* contain the same information as .qdat files, but the variable 
 37     values can be anything (floats, ints, strings).  **These files should 
 38     still contain quant_bounds!** 
 39   
 40   - *.qdat.pkl files* contain a pickled (binary) representation of 
 41     the data read in.  They store, in order: 
 42   
 43      1) A python list of the variable names 
 44   
 45      2) A python list of lists with the quantization bounds 
 46   
 47      3) A python list of the point names 
 48   
 49      4) A python list of lists with the data points 
 50   
 51  """ 
 52  from __future__ import print_function 
 53   
 54  import csv 
 55  import random 
 56  import re 
 57   
 58  import numpy 
 59   
 60  from rdkit.DataStructs import BitUtils 
 61  from rdkit.ML.Data import MLData 
 62  from rdkit.six import integer_types 
 63  from rdkit.six.moves import cPickle 
 64  from rdkit.utils import fileutils 
 65   
 66   
def permutation(nToDo):
    """ returns a random permutation of the integers 0 .. nToDo-1

    **Arguments**

      - nToDo: the number of indices to permute

    **Returns**

      a list containing every integer in range(nToDo) exactly once, in
      random order

    **Notes**

      - the shuffle is written out by hand instead of calling
        random.shuffle(res, random=random.random): that keyword argument was
        deprecated in Python 3.9 and removed in Python 3.11.  The loop is the
        algorithm random.shuffle applied when it was handed an explicit
        generator, so the values drawn from the standard _random_ module
        (and therefore the result for a given seed) are unchanged.

    """
    res = list(range(nToDo))
    # Fisher-Yates, walking from the back; one random.random() draw per swap
    for i in reversed(range(1, len(res))):
        j = int(random.random() * (i + 1))
        res[i], res[j] = res[j], res[i]
    return res
71 72
def WriteData(outFile, varNames, qBounds, examples):
    """ writes a data set to an open file in .qdat layout

    **Arguments**

      - outFile: a file-like object with a write() method

      - varNames: a list of variable names

      - qBounds: the quantization bounds, indexed in parallel with
        _varNames_ (one entry per variable)

      - examples: the data points; each one is written as a single line of
        space-separated values

    """
    separator = '# ----------\n'

    # header block: banner, separator, then the variable table
    outFile.write('# Quantized data from DataUtils\n')
    outFile.write(separator)
    outFile.write('# Variable Table\n')
    for idx, name in enumerate(varNames):
        outFile.write('# %s %s\n' % (name, str(qBounds[idx])))
    outFile.write(separator)

    # one line per data point
    for example in examples:
        outFile.write(' '.join(map(str, example)) + '\n')
96 97
def ReadVars(inFile):
    """ parses the variable table of a .qdat or .dat file

    **Arguments**

      - inFile: a file object; it is advanced to the line matching
        'Variable Table' and then read up to and including the first line
        containing '# ----'

    **Returns**

      a 2-tuple containing:

        1) varNames: a list of the variable names

        2) qbounds: for each variable, a list of floats with its
           quantization bounds (an empty list if none were given)

    """
    names = []
    boundTexts = []
    fileutils.MoveToMatchingLine(inFile, 'Variable Table')

    # first pass: collect the raw text of every definition line
    line = inFile.readline()
    while '# ----' not in line:
        # drop the leading '# ', then split the name from the bounds list
        pieces = line[2:].split('[')
        names.append(pieces[0].strip())
        # the last two characters (the closing ']' and the line ending) are cut
        boundTexts.append(pieces[1][:-2])
        line = inFile.readline()

    # second pass: convert the comma-separated text into floats
    bounds = []
    for text in boundTexts:
        if text != '':
            bounds.append([float(item) for item in text.split(',')])
        else:
            bounds.append([])
    return names, bounds
133 134
def ReadQuantExamples(inFile):
    """ reads the data points from a .qdat file

    **Arguments**

      - inFile: a file object, read line by line until it is exhausted

    **Returns**

      a 2-tuple containing:

        1) the names of the examples (the first field of each data line)

        2) a list of lists with the remaining fields converted to ints

    **Note**

      - lines starting with '#' and lines with fewer than two fields are
        skipped

      - every value must be convertible with int(), since .qdat files hold
        quantized data

    """
    commentExpr = re.compile(r'^#')
    splitExpr = re.compile(r'[\ ]+|[\t]+')
    names = []
    examples = []
    while True:
        line = inFile.readline()
        if not line:
            break
        if commentExpr.search(line) is not None:
            continue
        fields = splitExpr.split(line)
        if len(fields) > 1:
            examples.append([int(field) for field in fields[1:]])
            names.append(fields[0])
    return names, examples
169 170
def ReadGeneralExamples(inFile):
    """ reads the data points from a .dat file

    **Arguments**

      - inFile: a file object, read line by line until it is exhausted

    **Returns**

      a 2-tuple containing:

        1) the names of the examples (the first field of each data line)

        2) a list of lists containing the values of each example

    **Note**

      - each value is converted to an int if possible, otherwise to a
        float; if both conversions fail it is kept as a string

      - lines starting with '#' are skipped

      - the final whitespace-separated token of every line is discarded
        before conversion, so a value is only kept if it is followed by a
        space or a tab

    """
    commentExpr = re.compile(r'^#')
    splitExpr = re.compile(r'[\ ]+|[\t]+')
    names = []
    examples = []
    while True:
        line = inFile.readline()
        if not line:
            break
        if commentExpr.search(line) is not None:
            continue
        fields = splitExpr.split(line)[:-1]
        if len(fields) <= 1:
            continue
        values = []
        for token in fields[1:]:
            try:
                token = int(token)
            except ValueError:
                try:
                    token = float(token)
                except ValueError:
                    # neither int nor float: leave the text untouched
                    pass
            values.append(token)
        examples.append(values)
        names.append(fields[0])
    return names, examples
214 215
def BuildQuantDataSet(fileName):
    """ loads a .qdat file into a quantized data set

    **Arguments**

      - fileName: the name of the .qdat file to be opened for reading

    **Returns**

      an _MLData.MLQuantDataSet_ built from the variable table and the data
      points found in the file

    """
    with open(fileName, 'r') as qdatFile:
        # the variable table comes first; the examples follow it in the file
        names, bounds = ReadVars(qdatFile)
        pointNames, points = ReadQuantExamples(qdatFile)
    return MLData.MLQuantDataSet(points, qBounds=bounds, varNames=names, ptNames=pointNames)
233 234
def BuildDataSet(fileName):
    """ loads a .dat file into a data set

    **Arguments**

      - fileName: the name of the .dat file to be opened for reading

    **Returns**

      an _MLData.MLDataSet_ built from the variable table and the data
      points found in the file

    """
    with open(fileName, 'r') as datFile:
        # the variable table comes first; the examples follow it in the file
        names, bounds = ReadVars(datFile)
        pointNames, points = ReadGeneralExamples(datFile)
    return MLData.MLDataSet(points, qBounds=bounds, varNames=names, ptNames=pointNames)
252 253
def CalcNPossibleUsingMap(data, order, qBounds, nQBounds=None, silent=True):
    """ works out how many values each variable of a data set can take

    **Arguments**

      - data: a list of examples

      - order: the ordering map; variable _i_ is read from position
        _order[i]_ of each example

      - qBounds: the quantization bounds for the variables

      - nQBounds: (optional) per-variable counts; a variable with a nonzero
        entry here is not examined at all

      - silent: if false, diagnostic information is printed

    **Returns**

      a list with one integer per entry of _order_:

        - len(qBounds[i]) + 1 for variables with a non-empty _qBounds_ entry

        - (largest value seen) + 1 for variables whose values in _data_ are
          all integral numbers

        - 0 for everything else (nonzero _nQBounds_ entry, non-integral or
          non-numeric values)

    **Notes**

      - either _qBounds_ or _nQBounds_ must be non-empty and have the same
        length as _order_; this is checked with an assert

    """
    numericTypes = integer_types + (float, numpy.int64, numpy.int32, numpy.int16)

    if not silent:
        print('order:', order, len(order))
        print('qB:', qBounds)
    assert (qBounds and len(order) == len(qBounds)) or (nQBounds and len(order) == len(nQBounds)), \
        'order/qBounds mismatch'

    nVars = len(order)
    nPossible = [-1] * nVars

    # columns that still have to be resolved by scanning the data
    pending = []
    for idx in range(nVars):
        if nQBounds and nQBounds[idx] != 0:
            # explicitly excluded: stays at -1
            continue
        nBounds = len(qBounds[idx])
        if nBounds > 0:
            nPossible[idx] = nBounds
        else:
            pending.append(idx)

    for rowIdx in range(len(data)):
        # iterate over a copy, columns are dropped from pending as they fail
        for col in list(pending):
            value = data[rowIdx][order[col]]
            if type(value) in numericTypes:
                if int(value) == value:
                    nPossible[col] = max(int(value), nPossible[col])
                    continue
            elif not silent:
                print('bye bye col %d: %s' % (col, repr(value)))
            # non-integral or non-numeric value: give up on this column
            nPossible[col] = -1
            pending.remove(col)
    return [int(count) + 1 for count in nPossible]
312 313
def WritePickledData(outName, data):
    """ pickles a data set to a .qdat.pkl or .dat.pkl file

    **Arguments**

      - outName: the name of the file to be written (opened in 'wb+' mode)

      - data: either an _MLData.MLDataSet_ or an _MLData.MLQuantDataSet_

    **Notes**

      - four objects are dumped one after the other: the variable names,
        the quantization bounds, the point names and the examples

    """
    # collect everything before the file is opened
    pieces = (
        data.GetVarNames(),
        data.GetQuantBounds(),
        data.GetPtNames(),
        data.GetAllData(),
    )
    with open(outName, 'wb+') as outFile:
        for piece in pieces:
            cPickle.dump(piece, outFile)
333 334
def TakeEnsemble(vect, ensembleIds, isDataVect=False):
    """ picks the elements listed in _ensembleIds_ out of _vect_

    If _isDataVect_ is set, the first and last elements of _vect_ are always
    kept and the ids are applied to the elements after the first one.

    >>> v = [10,20,30,40,50]
    >>> TakeEnsemble(v,(1,2,3))
    [20, 30, 40]
    >>> v = ['foo',10,20,30,40,50,1]
    >>> TakeEnsemble(v,(1,2,3),isDataVect=True)
    ['foo', 20, 30, 40, 1]

    """
    if not isDataVect:
        return [vect[idx] for idx in ensembleIds]
    # ids are shifted by one to step over the leading element
    shifted = [idx + 1 for idx in ensembleIds]
    return [vect[0]] + [vect[idx] for idx in shifted] + [vect[-1]]
352 353
def DBToData(dbName, tableName, user='sysdba', password='masterkey', dupCol=-1, what='*', where='',
             join='', pickleCol=-1, pickleClass=None, ensembleIds=None):
    """ pulls a table out of a database and wraps it in an _MLData.MLDataSet_

    **Arguments**

      - dbName: the name of the database to be opened

      - tableName: the table name containing the data in the database

      - user: the user name to be used to connect to the database

      - password: the password to be used to connect to the database

      - dupCol: handed to the connection's GetData() as _removeDups_

      - what: the fields to fetch, handed to GetData() as _fields_

      - where, join: handed to GetData() unchanged

      - pickleCol: if >= 0, the position (counted after the first column
        has been removed) of a column whose value is rebuilt from its
        string form

      - pickleClass: (optional) a callable tried first for rebuilding the
        _pickleCol_ value; after its first failure cPickle.loads() is used
        for that row and all later rows

      - ensembleIds: (optional) the ids to keep; applied to the rebuilt
        _pickleCol_ value via BitUtils.ConstructEnsembleBV(), or to the
        whole row via TakeEnsemble() when _pickleCol_ is negative

    **Returns**

      an _MLData.MLDataSet_; the first column of every row is used as the
      point name

    **Notes**

      - this uses Dbase.DbConnection functionality

      - NOTE(review): values from the database are passed to
        cPickle.loads(); that is only safe if the database contents are
        trusted

    """
    from rdkit.Dbase.DbConnection import DbConnect
    conn = DbConnect(dbName, tableName, user, password)
    rows = conn.GetData(fields=what, where=where, join=join, removeDups=dupCol, forceList=1)

    vals = []
    ptNames = []
    tryClass = True  # cleared the first time pickleClass raises
    for idx in range(len(rows)):
        row = list(rows[idx])
        ptNames.append(row.pop(0))
        if pickleCol < 0:
            if ensembleIds:
                row = TakeEnsemble(row, ensembleIds, isDataVect=True)
        else:
            if pickleClass and tryClass:
                try:
                    row[pickleCol] = pickleClass(str(row[pickleCol]))
                except Exception:
                    # fall back to unpickling, and stop trying the class
                    row[pickleCol] = cPickle.loads(str(row[pickleCol]))
                    tryClass = False
            else:
                row[pickleCol] = cPickle.loads(str(row[pickleCol]))
            if ensembleIds:
                row[pickleCol] = BitUtils.ConstructEnsembleBV(row[pickleCol], ensembleIds)
        vals.append(row)

    varNames = conn.GetColumnNames(join=join, what=what)
    return MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
408 409
def TextToData(reader, ignoreCols=None, onlyCols=None):
    """ constructs an _MLData.MLDataSet_ from a bunch of text

    **Arguments**

      - reader: needs to be an iterator returning lists of elements
        (like a csv.reader); the first list is taken as the header holding
        the column names

      - ignoreCols: (optional) names of columns to leave out; only used
        when _onlyCols_ is not provided

      - onlyCols: (optional) names of the columns to keep, in the order in
        which they should appear in the result

    **Returns**

      an _MLData.MLDataSet_; the first kept column supplies the point
      names, the remaining kept columns are converted to int if possible,
      otherwise to float, otherwise left as strings

    **Raises**

      - ValueError if a non-empty data line does not have as many elements
        as the header

      - ValueError if _onlyCols_ names a column that is not in the header
        (previously the last column was silently used in its place)

    """
    # None instead of a mutable [] default; behaves like "ignore nothing"
    if ignoreCols is None:
        ignoreCols = ()

    varNames = next(reader)
    if not onlyCols:
        keepCols = [i for i, name in enumerate(varNames) if name not in ignoreCols]
    else:
        # if a name is repeated in the header, its last occurrence wins
        colIndex = {}
        for i, name in enumerate(varNames):
            colIndex[name] = i
        missing = [name for name in onlyCols if name not in colIndex]
        if missing:
            raise ValueError('columns not found in header: %s' %
                             ', '.join(str(name) for name in missing))
        keepCols = [colIndex[name] for name in onlyCols]

    nCols = len(varNames)
    varNames = tuple([varNames[x] for x in keepCols])
    vals = []
    ptNames = []
    for splitLine in reader:
        if not len(splitLine):
            # blank lines are skipped
            continue
        if len(splitLine) != nCols:
            raise ValueError('unequal line lengths')
        tmp = [splitLine[x] for x in keepCols]
        ptNames.append(tmp[0])
        pt = []
        for text in tmp[1:]:
            try:
                val = int(text)
            except ValueError:
                try:
                    val = float(text)
                except ValueError:
                    val = str(text)
            pt.append(val)
        vals.append(pt)
    data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
    return data
459 460
def TextFileToData(fName, onlyCols=None):
    """ builds a data set from a delimited text file

    **Arguments**

      - fName: the name of the file; if the text after its last '.' is
        'csv' (in any case) the file is parsed as comma-separated,
        otherwise as tab-separated

      - onlyCols: (optional) handed on to _TextToData_

    **Returns**

      whatever _TextToData_ returns for the contents of the file

    """
    isCsv = fName.split('.')[-1].upper() == 'CSV'
    # csv.reader's default delimiter is the comma, so only tabs need an override
    readerArgs = {} if isCsv else {'delimiter': '\t'}
    with open(fName, 'r') as inF:
        return TextToData(csv.reader(inF, **readerArgs), onlyCols=onlyCols)
475 476
def InitRandomNumbers(seed):
    """ Seeds the random number generators

    **Arguments**

      - seed: an indexable object (documented as a 2-tuple of integers);
        only its first element is used

    **Notes**

      - both the RDRandom generator and the one in the standard Python
        _random_ module are seeded with _seed[0]_

    """
    from rdkit import RDRandom
    for generator in (RDRandom, random):
        generator.seed(seed[0])
493 494
def FilterData(inData, val, frac, col=-1, indicesToUse=None, indicesOnly=0):
    """ randomly drops points so that about _frac_ of the kept ones have
    the value _val_ in column _col_

    **Arguments**

      - inData: a sequence of data points (each one indexable by _col_)

      - val: the target value to look for in column _col_

      - frac: the desired fraction (between 0 and 1) of kept points that
        have the target value

      - col: the column holding the value to filter on

      - indicesToUse: (optional) the indices of the points in _inData_ to
        consider; if empty, all points are used

      - indicesOnly: if set, indices are returned instead of the points

    **Returns**

      a 2-tuple (kept, rejected), both in random order.  With _indicesOnly_
      the entries are indices into _inData_, otherwise they are the points
      themselves.

    **Raises**

      - ValueError if _frac_ is outside [0, 1], if _col_ is not a valid
        index for the first point, or if no point has the value _val_

    **Notes**

      - if there are too few target points, all (or almost all) of them are
        kept and other points are discarded; if there are too many, it is
        the other way around

      - three permutations are drawn, in this order: one over the
        non-target points, one over the target points and one over all
        points

      - NOTE(review): degenerate inputs (e.g. every point has the target
        value and _frac_ is 0 or 1) reach a division by zero below; this
        has not been changed here

    """
    if frac < 0 or frac > 1:
        raise ValueError('filter fraction out of bounds')
    try:
        inData[0][col]
    except IndexError:
        raise ValueError('target column index out of range')

    # convert the input data to a list and sort them
    if indicesToUse:
        tmp = [inData[x] for x in indicesToUse]
    else:
        tmp = list(inData)
    nOrig = len(tmp)
    sortOrder = list(range(nOrig))
    sortOrder.sort(key=lambda x: tmp[x][col])
    tmp = [tmp[x] for x in sortOrder]

    # find the start of the entries with value val
    start = 0
    while start < nOrig and tmp[start][col] != val:
        start += 1
    if start >= nOrig:
        # %s rather than %d: the target may not be an integer, and %d would
        # raise TypeError for non-numeric values instead of this ValueError
        raise ValueError('target value (%s) not found in data' % (val, ))

    # find the end of the entries with value val
    finish = start + 1
    while finish < nOrig and tmp[finish][col] == val:
        finish += 1

    # how many entries have the target value?
    nWithVal = finish - start

    # how many don't?
    nOthers = len(tmp) - nWithVal

    currFrac = float(nWithVal) / nOrig
    if currFrac < frac:
        #
        # We're going to keep most of (all) the points with the target value,
        # We need to figure out how many of the other points we'll
        # toss out
        #
        nTgtFinal = nWithVal
        nFinal = int(round(nWithVal / frac))
        nOthersFinal = nFinal - nTgtFinal

        #
        # We may need to reduce the number of targets to keep
        # because it may make it impossible to hit exactly the
        # fraction we're trying for.  Take care of that now
        #
        while float(nTgtFinal) / nFinal > frac:
            nTgtFinal -= 1
            nFinal -= 1

    else:
        #
        # There are too many points with the target value,
        # we'll keep most of (all) the other points and toss a random
        # selection of the target value points
        #
        nOthersFinal = nOthers
        nFinal = int(round(nOthers / (1 - frac)))
        nTgtFinal = nFinal - nOthersFinal

        #
        # We may need to reduce the number of others to keep
        # because it may make it impossible to hit exactly the
        # fraction we're trying for.  Take care of that now
        #
        while float(nTgtFinal) / nFinal < frac:
            nOthersFinal -= 1
            nFinal -= 1

    # positions (in the sorted list) of the non-target points, randomly thinned
    others = list(range(start)) + list(range(finish, nOrig))
    othersTake = permutation(nOthers)
    others = [others[x] for x in othersTake[:nOthersFinal]]

    # positions (in the sorted list) of the target points, randomly thinned
    targets = list(range(start, finish))
    targetsTake = permutation(nWithVal)
    targets = [targets[x] for x in targetsTake[:nTgtFinal]]

    # these are all the indices we'll be keeping; a set keeps the
    # membership tests in the loops below cheap
    indicesToKeep = set(targets + others)

    res = []
    rej = []
    # now pull the points, but in random order
    if not indicesOnly:
        for i in permutation(nOrig):
            if i in indicesToKeep:
                res.append(tmp[i])
            else:
                rej.append(tmp[i])
    else:
        for i in permutation(nOrig):
            # map the sorted position back to an index into inData
            if not indicesToUse:
                idx = sortOrder[i]
            else:
                idx = indicesToUse[sortOrder[i]]
            if i in indicesToKeep:
                res.append(idx)
            else:
                rej.append(idx)
    return res, rej
605 606
def CountResults(inData, col=-1, bounds=None):
    """ tallies the values found in one column of a data set

    **Arguments**

      - inData: an iterable of data points (each one indexable by _col_)

      - col: the column to look at

      - bounds: (optional) a list of bounds used to bin the values: a value
        is assigned the index of the first bound it is smaller than, or
        len(bounds) if there is no such bound

    **Returns**

      a dictionary mapping each value (or bin index, if _bounds_ is
      provided) to the number of points that have it

    """
    tally = {}
    for point in inData:
        if not bounds:
            key = point[col]
        else:
            activity = point[col]
            for binIdx, limit in enumerate(bounds):
                if activity < limit:
                    key = binIdx
                    break
            else:
                # not below any bound: goes in the overflow bin
                key = len(bounds)
        tally[key] = tally.get(key, 0) + 1
    return tally
629 630
def RandomizeActivities(dataSet, shuffle=0, runDetails=None):
    """ randomizes the activity values of a dataset

    **Arguments**

      - dataSet: a _ML.Data.MLQuantDataSet_, the activities here will be randomized

      - shuffle: an optional toggle. If this is set, the activity values
        will be shuffled (so the number in each class remains constant);
        otherwise each point gets a new activity drawn with
        random.randint(0, nPossible), where nPossible is the last entry of
        dataSet.GetNPossibleVals()

      - runDetails: an optional CompositeRun object; its _shuffled_ or
        _randomized_ attribute is set to 1 to record what was done

    **Note**

      - the points of _dataSet_ are modified in place: the last element of
        each point is overwritten

      - the shuffle is written out by hand: the _random_ keyword of
        random.shuffle() was removed in Python 3.11.  The loop is what
        random.shuffle() did when handed random.random explicitly, so the
        result for a given seed is the same as before.

      - NOTE(review): random.randint() includes its upper limit, so the
        non-shuffle branch can produce the value nPossible itself; confirm
        that this is intended

    """
    nPts = dataSet.GetNPts()
    if shuffle:
        if runDetails:
            runDetails.shuffled = 1
        acts = dataSet.GetResults()[:]
        # Fisher-Yates, walking from the back; one random.random() draw per swap
        for i in reversed(range(1, len(acts))):
            j = int(random.random() * (i + 1))
            acts[i], acts[j] = acts[j], acts[i]
    else:
        if runDetails:
            runDetails.randomized = 1
        nPossible = dataSet.GetNPossibleVals()[-1]
        # one new activity per point (this used to reference an undefined
        # name and so could never run)
        acts = [random.randint(0, nPossible) for _ in range(nPts)]
    for i in range(nPts):
        tmp = dataSet[i]
        tmp[-1] = acts[i]
        dataSet[i] = tmp
666 667 668 # ------------------------------------ 669 # 670 # doctest boilerplate 671 #
def _runDoctests(verbose=None):  # pragma: nocover
    """ runs the doctests of this module and exits the interpreter, using
    the number of failed examples as the exit status

    **Arguments**

      - verbose: handed on to doctest.testmod()

    """
    import doctest
    import sys
    results = doctest.testmod(optionflags=doctest.ELLIPSIS, verbose=verbose)
    # the first element of the result is the failure count
    sys.exit(results[0])


if __name__ == '__main__':  # pragma: nocover
    _runDoctests()