Package rdkit :: Package Chem :: Module BuildFragmentCatalog
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.BuildFragmentCatalog

  1  # 
  2  #  Copyright (C) 2003-2008 Greg Landrum and Rational Discovery LLC 
  3  # 
  4  #   @@ All Rights Reserved @@ 
  5  #  This file is part of the RDKit. 
  6  #  The contents are covered by the terms of the BSD license 
  7  #  which is included in the file license.txt, found at the root 
  8  #  of the RDKit source tree. 
  9  # 
 10  """  command line utility for working with FragmentCatalogs (CASE-type analysis) 
 11   
 12  **Usage** 
 13   
 14    BuildFragmentCatalog [optional args] <filename> 
 15   
 16   filename, the name of a delimited text file containing InData, is required 
 17   for some modes of operation (see below) 
 18   
 19  **Command Line Arguments** 
 20   
 21   - -n *maxNumMols*:  specify the maximum number of molecules to be processed 
 22   
 23   - -b: build the catalog and OnBitLists 
 24      *requires InData* 
 25   
 26   - -s: score compounds 
 27      *requires InData and a Catalog, can use OnBitLists* 
 28   
 29   - -g: calculate info gains 
 30      *requires Scores* 
 31   
 32   - -d: show details about high-ranking fragments 
 33      *requires a Catalog and Gains* 
 34   
 35   - --catalog=*filename*: filename with the pickled catalog. 
 36      If -b is provided, this file will be overwritten. 
 37   
 38   - --onbits=*filename*: filename to hold the pickled OnBitLists. 
 39     If -b is provided, this file will be overwritten 
 40   
 41   - --scores=*filename*: filename to hold the text score data. 
 42     If -s is provided, this file will be overwritten 
 43   
 44   - --gains=*filename*: filename to hold the text gains data. 
 45     If -g is provided, this file will be overwritten 
 46   
 47   - --details=*filename*: filename to hold the text details data. 
 48     If -d is provided, this file will be overwritten. 
 49   
 50   - --minPath=2: specify the minimum length for a path 
 51   
 52   - --maxPath=6: specify the maximum length for a path 
 53   
 54   - --smiCol=1: specify which column in the input data file contains 
 55       SMILES 
 56   
 57   - --actCol=-1: specify which column in the input data file contains 
 58       activities 
 59   
 60   - --nActs=2: specify the number of possible activity values 
 61   
 62   - --nBits=-1: specify the maximum number of bits to show details for 
 63   
 64  """ 
 65  from __future__ import print_function 
 66   
 67  import os 
 68  import sys 
 69   
 70  import numpy 
 71   
 72  from rdkit import RDConfig 
 73  from rdkit.Chem import FragmentCatalog 
 74  from rdkit.Dbase.DbConnection import DbConnect 
 75  from rdkit.ML import InfoTheory 
 76  from rdkit.six import next 
 77  from rdkit.six.moves import cPickle 
 78   
 79   
80 -def message(msg, dest=sys.stdout):
81 dest.write(msg)
82 83
84 -def BuildCatalog(suppl, maxPts=-1, groupFileName=None, minPath=2, maxPath=6, reportFreq=10):
85 """ builds a fragment catalog from a set of molecules in a delimited text block 86 87 **Arguments** 88 89 - suppl: a mol supplier 90 91 - maxPts: (optional) if provided, this will set an upper bound on the 92 number of points to be considered 93 94 - groupFileName: (optional) name of the file containing functional group 95 information 96 97 - minPath, maxPath: (optional) names of the minimum and maximum path lengths 98 to be considered 99 100 - reportFreq: (optional) how often to display status information 101 102 **Returns** 103 104 a FragmentCatalog 105 106 """ 107 if groupFileName is None: 108 groupFileName = os.path.join(RDConfig.RDDataDir, "FunctionalGroups.txt") 109 110 fpParams = FragmentCatalog.FragCatParams(minPath, maxPath, groupFileName) 111 catalog = FragmentCatalog.FragCatalog(fpParams) 112 fgen = FragmentCatalog.FragCatGenerator() 113 if maxPts > 0: 114 nPts = maxPts 115 else: 116 if hasattr(suppl, '__len__'): 117 nPts = len(suppl) 118 else: 119 nPts = -1 120 for i, mol in enumerate(suppl): 121 if i == nPts: 122 break 123 if i and not i % reportFreq: 124 if nPts > -1: 125 message('Done %d of %d, %d paths\n' % (i, nPts, catalog.GetFPLength())) 126 else: 127 message('Done %d, %d paths\n' % (i, catalog.GetFPLength())) 128 fgen.AddFragsFromMol(mol, catalog) 129 return catalog
130 131
132 -def ScoreMolecules(suppl, catalog, maxPts=-1, actName='', acts=None, nActs=2, reportFreq=10):
133 """ scores the compounds in a supplier using a catalog 134 135 **Arguments** 136 137 - suppl: a mol supplier 138 139 - catalog: the FragmentCatalog 140 141 - maxPts: (optional) the maximum number of molecules to be 142 considered 143 144 - actName: (optional) the name of the molecule's activity property. 145 If this is not provided, the molecule's last property will be used. 146 147 - acts: (optional) a sequence of activity values (integers). 148 If not provided, the activities will be read from the molecules. 149 150 - nActs: (optional) number of possible activity values 151 152 - reportFreq: (optional) how often to display status information 153 154 **Returns** 155 156 a 2-tuple: 157 158 1) the results table (a 3D array of ints nBits x 2 x nActs) 159 160 2) a list containing the on bit lists for each molecule 161 162 """ 163 nBits = catalog.GetFPLength() 164 resTbl = numpy.zeros((nBits, 2, nActs), numpy.int) 165 obls = [] 166 167 if not actName and not acts: 168 actName = suppl[0].GetPropNames()[-1] 169 170 fpgen = FragmentCatalog.FragFPGenerator() 171 suppl.reset() 172 i = 1 173 for mol in suppl: 174 if i and not i % reportFreq: 175 message('Done %d.\n' % (i)) 176 if mol: 177 if not acts: 178 act = int(mol.GetProp(actName)) 179 else: 180 act = acts[i - 1] 181 fp = fpgen.GetFPForMol(mol, catalog) 182 obls.append([x for x in fp.GetOnBits()]) 183 for j in range(nBits): 184 resTbl[j, 0, act] += 1 185 for id_ in obls[i - 1]: 186 resTbl[id_ - 1, 0, act] -= 1 187 resTbl[id_ - 1, 1, act] += 1 188 else: 189 obls.append([]) 190 i += 1 191 return resTbl, obls
192 193
194 -def ScoreFromLists(bitLists, suppl, catalog, maxPts=-1, actName='', acts=None, nActs=2, 195 reportFreq=10):
196 """ similar to _ScoreMolecules()_, but uses pre-calculated bit lists 197 for the molecules (this speeds things up a lot) 198 199 200 **Arguments** 201 202 - bitLists: sequence of on bit sequences for the input molecules 203 204 - suppl: the input supplier (we read activities from here) 205 206 - catalog: the FragmentCatalog 207 208 - maxPts: (optional) the maximum number of molecules to be 209 considered 210 211 - actName: (optional) the name of the molecule's activity property. 212 If this is not provided, the molecule's last property will be used. 213 214 - nActs: (optional) number of possible activity values 215 216 - reportFreq: (optional) how often to display status information 217 218 **Returns** 219 220 the results table (a 3D array of ints nBits x 2 x nActs) 221 222 """ 223 nBits = catalog.GetFPLength() 224 if maxPts > 0: 225 nPts = maxPts 226 else: 227 nPts = len(bitLists) 228 resTbl = numpy.zeros((nBits, 2, nActs), numpy.int) 229 if not actName and not acts: 230 actName = suppl[0].GetPropNames()[-1] 231 suppl.reset() 232 for i in range(1, nPts + 1): 233 mol = next(suppl) 234 if not acts: 235 act = int(mol.GetProp(actName)) 236 else: 237 act = acts[i - 1] 238 if i and not i % reportFreq: 239 message('Done %d of %d\n' % (i, nPts)) 240 ids = set() 241 for id_ in bitLists[i - 1]: 242 ids.add(id_ - 1) 243 for j in range(nBits): 244 resTbl[j, 0, act] += 1 245 for id_ in ids: 246 resTbl[id_, 0, act] -= 1 247 resTbl[id_, 1, act] += 1 248 return resTbl
249 250
251 -def CalcGains(suppl, catalog, topN=-1, actName='', acts=None, nActs=2, reportFreq=10, biasList=None, 252 collectFps=0):
253 """ calculates info gains by constructing fingerprints 254 *DOC* 255 256 Returns a 2-tuple: 257 1) gains matrix 258 2) list of fingerprints 259 260 """ 261 nBits = catalog.GetFPLength() 262 if topN < 0: 263 topN = nBits 264 if not actName and not acts: 265 actName = suppl[0].GetPropNames()[-1] 266 267 if hasattr(suppl, '__len__'): 268 nMols = len(suppl) 269 else: 270 nMols = -1 271 fpgen = FragmentCatalog.FragFPGenerator() 272 # ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.ENTROPY) 273 if biasList: 274 ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.BIASENTROPY) 275 ranker.SetBiasList(biasList) 276 else: 277 ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.ENTROPY) 278 i = 0 279 fps = [] 280 for mol in suppl: 281 if not acts: 282 try: 283 act = int(mol.GetProp(actName)) 284 except KeyError: 285 message('ERROR: Molecule has no property: %s\n' % (actName)) 286 message('\tAvailable properties are: %s\n' % (str(mol.GetPropNames()))) 287 raise KeyError(actName) 288 else: 289 act = acts[i] 290 if i and not i % reportFreq: 291 if nMols > 0: 292 message('Done %d of %d.\n' % (i, nMols)) 293 else: 294 message('Done %d.\n' % (i)) 295 fp = fpgen.GetFPForMol(mol, catalog) 296 ranker.AccumulateVotes(fp, act) 297 i += 1 298 if collectFps: 299 fps.append(fp) 300 gains = ranker.GetTopN(topN) 301 return gains, fps
302 303
304 -def CalcGainsFromFps(suppl, fps, topN=-1, actName='', acts=None, nActs=2, reportFreq=10, 305 biasList=None):
306 """ calculates info gains from a set of fingerprints 307 308 *DOC* 309 310 """ 311 nBits = len(fps[0]) 312 if topN < 0: 313 topN = nBits 314 if not actName and not acts: 315 actName = suppl[0].GetPropNames()[-1] 316 317 if hasattr(suppl, '__len__'): 318 nMols = len(suppl) 319 else: 320 nMols = -1 321 if biasList: 322 ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.BIASENTROPY) 323 ranker.SetBiasList(biasList) 324 else: 325 ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.ENTROPY) 326 for i, mol in enumerate(suppl): 327 if not acts: 328 try: 329 act = int(mol.GetProp(actName)) 330 except KeyError: 331 message('ERROR: Molecule has no property: %s\n' % (actName)) 332 message('\tAvailable properties are: %s\n' % (str(mol.GetPropNames()))) 333 raise KeyError(actName) 334 else: 335 act = acts[i] 336 if i and not i % reportFreq: 337 if nMols > 0: 338 message('Done %d of %d.\n' % (i, nMols)) 339 else: 340 message('Done %d.\n' % (i)) 341 fp = fps[i] 342 ranker.AccumulateVotes(fp, act) 343 gains = ranker.GetTopN(topN) 344 return gains
345 346
347 -def OutputGainsData(outF, gains, cat, nActs=2):
348 actHeaders = ['Act-%d' % (x) for x in range(nActs)] 349 if cat: 350 outF.write('id,Description,Gain,%s\n' % (','.join(actHeaders))) 351 else: 352 outF.write('id,Gain,%s\n' % (','.join(actHeaders))) 353 for entry in gains: 354 id_ = int(entry[0]) 355 outL = [str(id_)] 356 if cat: 357 descr = cat.GetBitDescription(id_) 358 outL.append(descr) 359 outL.append('%.6f' % entry[1]) 360 outL += ['%d' % x for x in entry[2:]] 361 outF.write(','.join(outL)) 362 outF.write('\n')
363 364
365 -def ProcessGainsData(inF, delim=',', idCol=0, gainCol=1):
366 """ reads a list of ids and info gains out of an input file 367 368 """ 369 res = [] 370 _ = inF.readline() 371 for line in inF: 372 splitL = line.strip().split(delim) 373 res.append((splitL[idCol], float(splitL[gainCol]))) 374 return res
375 376
377 -def ShowDetails(catalog, gains, nToDo=-1, outF=sys.stdout, idCol=0, gainCol=1, outDelim=','):
378 """ 379 gains should be a sequence of sequences. The idCol entry of each 380 sub-sequence should be a catalog ID. _ProcessGainsData()_ provides 381 suitable input. 382 383 """ 384 if nToDo < 0: 385 nToDo = len(gains) 386 for i in range(nToDo): 387 id_ = int(gains[i][idCol]) 388 gain = float(gains[i][gainCol]) 389 descr = catalog.GetFragDescription(id_) 390 if descr: 391 outF.write('%s\n' % (outDelim.join((str(id_), descr, str(gain)))))
392 393
394 -def SupplierFromDetails(details):
395 from rdkit.VLib.NodeLib.DbMolSupply import DbMolSupplyNode 396 from rdkit.VLib.NodeLib.SmilesSupply import SmilesSupplyNode 397 398 if details.dbName: 399 conn = DbConnect(details.dbName, details.tableName) 400 suppl = DbMolSupplyNode(conn.GetData()) 401 else: 402 suppl = SmilesSupplyNode(details.inFileName, delim=details.delim, nameColumn=details.nameCol, 403 smilesColumn=details.smiCol, titleLine=details.hasTitle) 404 if isinstance(details.actCol, int): 405 suppl.reset() 406 m = next(suppl) 407 actName = m.GetPropNames()[details.actCol] 408 details.actCol = actName 409 if isinstance(details.nameCol, int): 410 suppl.reset() 411 m = next(suppl) 412 nameName = m.GetPropNames()[details.nameCol] 413 details.nameCol = nameName 414 suppl.reset() 415 if isinstance(details.actCol, int): 416 suppl.reset() 417 m = next(suppl) 418 actName = m.GetPropNames()[details.actCol] 419 details.actCol = actName 420 if isinstance(details.nameCol, int): 421 suppl.reset() 422 m = next(suppl) 423 nameName = m.GetPropNames()[details.nameCol] 424 details.nameCol = nameName 425 suppl.reset() 426 return suppl
427 428
429 -def Usage():
430 print("This is BuildFragmentCatalog") 431 print('usage error') 432 # print(__doc__) 433 sys.exit(-1)
434 435
436 -class RunDetails(object):
437 numMols = -1 438 doBuild = 0 439 doSigs = 0 440 doScore = 0 441 doGains = 0 442 doDetails = 0 443 catalogName = None 444 onBitsName = None 445 scoresName = None 446 gainsName = None 447 dbName = '' 448 tableName = None 449 detailsName = None 450 inFileName = None 451 fpName = None 452 minPath = 2 453 maxPath = 6 454 smiCol = 1 455 actCol = -1 456 nameCol = -1 457 hasTitle = 1 458 nActs = 2 459 nBits = -1 460 delim = ',' 461 biasList = None 462 topN = -1
463 464
465 -def ParseArgs(details):
466 import getopt 467 try: 468 args, extras = getopt.getopt(sys.argv[1:], 'n:d:cst', 469 ['catalog=', 'onbits=', 'scoresFile=', 'gainsFile=', 470 'detailsFile=', 'fpFile=', 'minPath=', 'maxPath=', 'smiCol=', 471 'actCol=', 'nameCol=', 'nActs=', 'nBits=', 'biasList=', 'topN=', 472 'build', 'sigs', 'gains', 'details', 'score', 'noTitle']) 473 except Exception: 474 sys.stderr.write('Error parsing command line:\n') 475 import traceback 476 traceback.print_exc() 477 Usage() 478 for arg, val in args: 479 if arg == '-n': 480 details.numMols = int(val) 481 elif arg == '-c': 482 details.delim = ',' 483 elif arg == '-s': 484 details.delim = ' ' 485 elif arg == '-t': 486 details.delim = '\t' 487 elif arg == '-d': 488 details.dbName = val 489 elif arg == '--build': 490 details.doBuild = 1 491 elif arg == '--score': 492 details.doScore = 1 493 elif arg == '--gains': 494 details.doGains = 1 495 elif arg == '--sigs': 496 details.doSigs = 1 497 elif arg == '-details': 498 details.doDetails = 1 499 elif arg == '--catalog': 500 details.catalogName = val 501 elif arg == '--onbits': 502 details.onBitsName = val 503 elif arg == '--scoresFile': 504 details.scoresName = val 505 elif arg == '--gainsFile': 506 details.gainsName = val 507 elif arg == '--detailsFile': 508 details.detailsName = val 509 elif arg == '--fpFile': 510 details.fpName = val 511 elif arg == '--minPath': 512 details.minPath = int(val) 513 elif arg == '--maxPath': 514 details.maxPath = int(val) 515 elif arg == '--smiCol': 516 try: 517 details.smiCol = int(val) 518 except ValueError: 519 details.smiCol = val 520 elif arg == '--actCol': 521 try: 522 details.actCol = int(val) 523 except ValueError: 524 details.actCol = val 525 elif arg == '--nameCol': 526 try: 527 details.nameCol = int(val) 528 except ValueError: 529 details.nameCol = val 530 elif arg == '--nActs': 531 details.nActs = int(val) 532 elif arg == '--nBits': 533 details.nBits = int(val) 534 elif arg == '--noTitle': 535 details.hasTitle = 0 536 elif arg == '--biasList': 537 details.biasList = tuple(eval(val)) 538 elif arg == '--topN': 539 details.topN = int(val) 540 elif arg == '-h': 541 Usage() 542 sys.exit(0) 543 else: 544 Usage() 545 if len(extras): 546 if details.dbName: 547 details.tableName = extras[0] 548 else: 549 details.inFileName = extras[0] 550 else: 551 Usage()
552 553 554 if __name__ == '__main__': 555 import time 556 details = RunDetails() 557 ParseArgs(details) 558 from io import StringIO 559 suppl = SupplierFromDetails(details) 560 561 cat = None 562 obls = None 563 if details.doBuild: 564 if not suppl: 565 message("We require inData to generate a catalog\n") 566 sys.exit(-2) 567 message("Building catalog\n") 568 t1 = time.time() 569 cat = BuildCatalog(suppl, maxPts=details.numMols, minPath=details.minPath, 570 maxPath=details.maxPath) 571 t2 = time.time() 572 message("\tThat took %.2f seconds.\n" % (t2 - t1)) 573 if details.catalogName: 574 message("Dumping catalog data\n") 575 cPickle.dump(cat, open(details.catalogName, 'wb+')) 576 elif details.catalogName: 577 message("Loading catalog\n") 578 cat = cPickle.load(open(details.catalogName, 'rb')) 579 if details.onBitsName: 580 try: 581 obls = cPickle.load(open(details.onBitsName, 'rb')) 582 except Exception: 583 obls = None 584 else: 585 if len(obls) < (inD.count('\n') - 1): 586 obls = None 587 scores = None 588 if details.doScore: 589 if not suppl: 590 message("We require inData to score molecules\n") 591 sys.exit(-2) 592 if not cat: 593 message("We require a catalog to score molecules\n") 594 sys.exit(-2) 595 message("Scoring compounds\n") 596 if not obls or len(obls) < details.numMols: 597 scores, obls = ScoreMolecules(suppl, cat, maxPts=details.numMols, actName=details.actCol, 598 nActs=details.nActs) 599 if details.scoresName: 600 cPickle.dump(scores, open(details.scoresName, 'wb+')) 601 if details.onBitsName: 602 cPickle.dump(obls, open(details.onBitsName, 'wb+')) 603 else: 604 scores = ScoreFromLists(obls, suppl, cat, maxPts=details.numMols, actName=details.actCol, 605 nActs=details.nActs) 606 elif details.scoresName: 607 scores = cPickle.load(open(details.scoresName, 'rb')) 608 609 if details.fpName and os.path.exists(details.fpName) and not details.doSigs: 610 message("Reading fingerprints from file.\n") 611 fps = cPickle.load(open(details.fpName, 'rb')) 612 else: 613 fps = [] 614 gains = None 615 if details.doGains: 616 if not suppl: 617 message("We require inData to calculate gains\n") 618 sys.exit(-2) 619 if not (cat or fps): 620 message("We require either a catalog or fingerprints to calculate gains\n") 621 sys.exit(-2) 622 message("Calculating Gains\n") 623 t1 = time.time() 624 if details.fpName: 625 collectFps = 1 626 else: 627 collectFps = 0 628 if not fps: 629 gains, fps = CalcGains(suppl, cat, topN=details.topN, actName=details.actCol, 630 nActs=details.nActs, biasList=details.biasList, collectFps=collectFps) 631 if details.fpName: 632 message("Writing fingerprint file.\n") 633 tmpF = open(details.fpName, 'wb+') 634 cPickle.dump(fps, tmpF, 1) 635 tmpF.close() 636 else: 637 gains = CalcGainsFromFps(suppl, fps, topN=details.topN, actName=details.actCol, 638 nActs=details.nActs, biasList=details.biasList) 639 t2 = time.time() 640 message("\tThat took %.2f seconds.\n" % (t2 - t1)) 641 if details.gainsName: 642 outF = open(details.gainsName, 'w+') 643 OutputGainsData(outF, gains, cat, nActs=details.nActs) 644 else: 645 if details.gainsName: 646 inF = open(details.gainsName, 'r') 647 gains = ProcessGainsData(inF) 648 649 if details.doDetails: 650 if not cat: 651 message("We require a catalog to get details\n") 652 sys.exit(-2) 653 if not gains: 654 message("We require gains data to get details\n") 655 sys.exit(-2) 656 io = StringIO() 657 io.write('id,SMILES,gain\n') 658 ShowDetails(cat, gains, nToDo=details.nBits, outF=io) 659 if details.detailsName: 660 open(details.detailsName, 'w+').write(io.getvalue()) 661 else: 662 sys.stderr.write(io.getvalue()) 663