1
2
3
4
5
6
7
8
9
10 """ utility functionality for fingerprinting sets of molecules
11 includes a command line app for working with fingerprints
12 and databases
13
14
15 Sample Usage:
16
17 python FingerprintMols.py -d data.gdb \
18 -t 'raw_dop_data' --smilesName="Structure" --idName="Mol_ID" \
19 --outTable="daylight_sig"
20
21
22 """
23
24 from __future__ import print_function
25
26 import getopt
27 import sys
28
29 from rdkit import Chem
30 from rdkit import DataStructs
31 from rdkit.Chem import MACCSkeys
32 from rdkit.ML.Cluster import Murtagh
33 from rdkit.six.moves import cPickle
34
35
38
39
42
43
48
49
def FoldFingerprintToTargetDensity(fp, **fpArgs):
    """ folds a fingerprint in half until it reaches a target bit density

    **Arguments**

      - fp: the fingerprint to fold; must provide GetNumOnBits() and
        GetNumBits()

      - fpArgs: keyword arguments.  'tgtDensity' is always read and is the
        on-bit fraction to reach; 'minSize' is read only when a fold is
        being considered and is the size that halving must stay above.
        Any other keys are ignored.

    **Returns**

      the fingerprint, folded zero or more times with
      DataStructs.FoldFingerprint(fp, 2)

    """
    tgtDensity = fpArgs['tgtDensity']
    nOn = fp.GetNumOnBits()
    nTot = fp.GetNumBits()
    # nTot > 0: a zero-length fingerprint has no density (would divide by zero)
    while nTot > 0 and float(nOn) / nTot < tgtDensity:
        # floor division keeps the size test in integers on Python 2 and 3 alike
        if nTot // 2 <= fpArgs['minSize']:
            break
        fp = DataStructs.FoldFingerprint(fp, 2)
        nOn = fp.GetNumOnBits()
        nTot = fp.GetNumBits()
    return fp
61
62
# NOTE(review): the def line is missing from this view; the signature is
# rebuilt from the call FingerprintMol(mol, fingerprinter, **fpArgs) and the
# default for fingerprinter is assumed -- confirm against the original.
def FingerprintMol(mol, fingerprinter=Chem.RDKFingerprint, **fpArgs):
    """ generates a fingerprint for a single molecule

    **Arguments**

      - mol: the molecule to fingerprint

      - fingerprinter: the callable used to build the fingerprint

      - fpArgs: fingerprinting settings.  When none are given, the
        attribute dictionary of a fresh FingerprinterDetails is used.

    **Returns**

      the fingerprint

    """
    if not fpArgs:
        fpArgs = FingerprinterDetails().__dict__

    if fingerprinter == Chem.RDKFingerprint:
        # tgtDensity and minSize are passed straight to this fingerprinter,
        # so no separate folding step is run here
        return fingerprinter(mol, fpArgs['minPath'], fpArgs['maxPath'], fpArgs['fpSize'],
                             fpArgs['bitsPerHash'], fpArgs['useHs'], fpArgs['tgtDensity'],
                             fpArgs['minSize'])

    # any other fingerprinter receives every setting as a keyword argument
    # and its result is folded afterwards
    rawFp = fingerprinter(mol, **fpArgs)
    return FoldFingerprintToTargetDensity(rawFp, **fpArgs)
76
77
# NOTE(review): the def line is missing from this view; parameter names come
# from the body and from the call in FingerprintsFromDetails, the defaults
# are assumed -- confirm against the original.
def FingerprintsFromSmiles(dataSource, idCol, smiCol, fingerprinter=Chem.RDKFingerprint,
                           reportFreq=10, maxMols=-1, **fpArgs):
    """ fingerprints the molecules described by SMILES in a data source

    Each entry of dataSource is indexed with idCol and smiCol to get the
    molecule ID and its SMILES.  fpArgs are passed as keyword arguments to
    the fingerprinter.  Entries whose SMILES cannot be parsed are reported
    via error() and skipped.

    Returns a list of 2-tuples: (ID,fp)

    """
    results = []
    for row in dataSource:
        molId = str(row[idCol])
        smiles = str(row[smiCol])
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            error('Problems parsing SMILES: %s\n' % smiles)
            continue
        results.append((molId, FingerprintMol(mol, fingerprinter, **fpArgs)))
        # one result per successful molecule, so the list length is the count
        count = len(results)
        if reportFreq > 0 and count % reportFreq == 0:
            message('Done %d molecules\n' % (count))
        if maxMols > 0 and count >= maxMols:
            break
    return results
101
102
# NOTE(review): the def line is missing from this view; parameter names come
# from the body and from the call in FingerprintsFromDetails, the defaults
# are assumed -- confirm against the original.
def FingerprintsFromMols(mols, fingerprinter=Chem.RDKFingerprint, reportFreq=10, maxMols=-1,
                         **fpArgs):
    """ fingerprints a sequence of (ID, molecule) pairs

    fpArgs are passed as keyword arguments to the fingerprinter.  Pairs
    whose molecule is false-y (e.g. None) are reported via error() and
    skipped.

    Returns a list of 2-tuples: (ID,fp)

    """
    res = []
    nDone = 0
    for ID, mol in mols:
        if mol:
            fp = FingerprintMol(mol, fingerprinter, **fpArgs)
            res.append((ID, fp))
            nDone += 1
            if reportFreq > 0 and not nDone % reportFreq:
                message('Done %d molecules\n' % (nDone))
            if maxMols > 0 and nDone >= maxMols:
                break
        else:
            # there is no SMILES in this function (the old message formatted
            # an undefined name and raised NameError); identify the entry by ID
            error('Problems with molecule for ID: %s\n' % ID)
    return res
124
125
# NOTE(review): the def line is missing from this view; parameter names come
# from the body and from the call in FingerprintsFromDetails, the defaults
# are assumed -- confirm against the original.
def FingerprintsFromPickles(dataSource, idCol, pklCol, fingerprinter=Chem.RDKFingerprint,
                            reportFreq=10, maxMols=-1, **fpArgs):
    """ fingerprints the molecules stored as pickles in a data source

    Each entry of dataSource is indexed with idCol and pklCol to get the
    molecule ID and its pickle, from which the molecule is built with
    Chem.Mol().  fpArgs are passed as keyword arguments to the fingerprinter.

    Returns a list of 2-tuples: (ID,fp)

    """
    results = []
    for row in dataSource:
        molId = str(row[idCol])
        pickled = str(row[pklCol])
        mol = Chem.Mol(pickled)
        # NOTE(review): whether Chem.Mol() can hand back None for a bad
        # pickle (rather than raising) is not visible here -- confirm
        if mol is None:
            error('Problems parsing pickle for ID: %s\n' % molId)
            continue
        results.append((molId, FingerprintMol(mol, fingerprinter, **fpArgs)))
        # one result per successful molecule, so the list length is the count
        count = len(results)
        if reportFreq > 0 and count % reportFreq == 0:
            message('Done %d molecules\n' % (count))
        if maxMols > 0 and count >= maxMols:
            break
    return results
149
150
# NOTE(review): the def line is missing from this view; `details` comes from
# the call FingerprintsFromDetails(details) and `reportFreq` is read by the
# body without being assigned, so it is taken to be a parameter.  Its default
# is assumed -- confirm against the original.
def FingerprintsFromDetails(details, reportFreq=10):
    """ generates (and optionally stores) the fingerprints described by details

    **Arguments**

      - details: the settings object.  Input comes from, in order of
        preference: the database table (dbName and tableName), a text file
        (inFileName with useSmiles) or an SD file (inFileName with useSD).
        details.idName is filled in when it is empty.  The whole
        details.__dict__ is forwarded as keyword arguments to the
        fingerprinting function.

      - reportFreq: when > 0, a progress message is written every
        reportFreq molecules while reading an SD file

    **Returns**

      the list of (ID, fingerprint) 2-tuples, or None when there was
      nothing to fingerprint

    **Side effects**

      - if details.outFileName is set, each (ID, fingerprint) tuple is
        pickled to that file

      - if details.outTableName and a database name (outDbName or dbName)
        are set, the fingerprints are inserted into that table

    """
    data = None
    dataSet = None
    idCol = 0
    smiCol = 1
    if details.dbName and details.tableName:
        from rdkit.Dbase.DbConnection import DbConnect
        from rdkit.Dbase import DbInfo
        from rdkit.ML.Data import DataUtils
        try:
            # connectivity check only: the data is read through DataUtils below
            DbConnect(details.dbName, details.tableName)
        except Exception:
            import traceback
            error('Problems establishing connection to database: %s|%s\n' % (details.dbName,
                                                                             details.tableName))
            traceback.print_exc()
        if not details.idName:
            details.idName = DbInfo.GetColumnNames(details.dbName, details.tableName)[0]
        dataSet = DataUtils.DBToData(details.dbName, details.tableName,
                                     what='%s,%s' % (details.idName, details.smilesName))
    elif details.inFileName and details.useSmiles:
        from rdkit.ML.Data import DataUtils
        if not details.idName:
            details.idName = 'ID'
        try:
            dataSet = DataUtils.TextFileToData(details.inFileName,
                                               onlyCols=[details.idName, details.smilesName])
        except IOError:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
            # leave dataSet as None so the code below sees "no input"
            # instead of tripping over an unbound name
            dataSet = None
    elif details.inFileName and details.useSD:
        if not details.idName:
            details.idName = 'ID'
        dataSet = []
        try:
            supplier = Chem.SDMolSupplier(details.inFileName)
        except Exception:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
        else:
            # plain iteration works on Python 2 and 3 (supplier.next() does not)
            for mol in supplier:
                if not mol:
                    continue
                dataSet.append(mol)
                if reportFreq > 0 and not len(dataSet) % reportFreq:
                    message('Read %d molecules\n' % (len(dataSet)))
                if details.maxMols > 0 and len(dataSet) >= details.maxMols:
                    break

            # pair each molecule with its ID property, falling back to its name
            for i, mol in enumerate(dataSet):
                if mol.HasProp(details.idName):
                    nm = mol.GetProp(details.idName)
                else:
                    nm = mol.GetProp('_Name')
                dataSet[i] = (nm, mol)

    fps = None
    if dataSet:
        # direct calls with ** replace the Python-2-only apply() builtin
        fpKwargs = details.__dict__
        if details.useSD:
            fps = FingerprintsFromMols(dataSet, **fpKwargs)
        else:
            data = dataSet.GetNamedData()
            if not details.molPklName:
                fps = FingerprintsFromSmiles(data, idCol, smiCol, **fpKwargs)
            else:
                fps = FingerprintsFromPickles(data, idCol, smiCol, **fpKwargs)

    if fps:
        if details.outFileName:
            # the with-block closes the file even if pickling fails part way
            with open(details.outFileName, 'wb+') as outF:
                for entry in fps:
                    cPickle.dump(entry, outF)
        dbName = details.outDbName or details.dbName
        if details.outTableName and dbName:
            from rdkit.Dbase.DbConnection import DbConnect
            from rdkit.Dbase import DbUtils, DbModule
            conn = DbConnect(dbName)

            # work out the column type of the ID column for the new table
            if data is not None:
                colTypes = DbUtils.TypeFinder(data, len(data), len(data[0]))
                typeStrs = DbUtils.GetTypeStrings([details.idName, details.smilesName], colTypes,
                                                  keyCol=details.idName)
            else:
                # SD input has no tabular data (data stays None, which used to
                # raise TypeError here); type the ID column from the IDs.
                # NOTE(review): single-column use of TypeFinder/GetTypeStrings
                # mirrors the two-column call above -- confirm against DbUtils
                idRows = [(ID, ) for ID, _ in fps]
                colTypes = DbUtils.TypeFinder(idRows, len(idRows), 1)
                typeStrs = DbUtils.GetTypeStrings([details.idName], colTypes,
                                                  keyCol=details.idName)
            cols = '%s, %s %s' % (typeStrs[0], details.fpColName, DbModule.binaryTypeName)

            # create the table when asked to replace it or when it does not exist yet
            if details.replaceTable or \
               details.outTableName.upper() not in [x.upper() for x in conn.GetTableNames()]:
                conn.AddTable(details.outTableName, cols)

            # and add the data
            for ID, fp in fps:
                tpl = ID, DbModule.binaryHolder(fp.ToBinary())
                conn.InsertData(details.outTableName, tpl)
            conn.Commit()
    return fps
268
269
270
271
272
273
274
276 """ class for storing the details of a fingerprinting run,
277 generates sensible defaults on construction
278
279 """
280
# NOTE(review): the class statement and the method header lines are not
# present in this view; only the attribute assignments below are visible.
285
# --- fingerprint generation and input/output settings ---
# fingerprinter is the callable FingerprintMol compares against
# Chem.RDKFingerprint; minPath, maxPath, fpSize, bitsPerHash, useHs,
# tgtDensity and minSize are the values FingerprintMol passes to it.
287 self.fingerprinter = Chem.RDKFingerprint
288 self.fpColName = "AutoFragmentFP"
289 self.idName = ''
290 self.dbName = ''
291 self.outDbName = ''
292 self.tableName = ''
293 self.minSize = 64
294 self.fpSize = 2048
295 self.tgtDensity = 0.3
296 self.minPath = 1
297 self.maxPath = 7
298 self.discrimHash = 0
299 self.useHs = 0
300 self.useValence = 0
301 self.bitsPerHash = 2
302 self.smilesName = 'SMILES'
# maxMols is only enforced when it is > 0, so -1 means "no limit"
303 self.maxMols = -1
304 self.outFileName = ''
305 self.outTableName = ''
306 self.inFileName = ''
# replaceTable is cleared by --keepTable; useSmiles/useSD are flipped by --useSD
307 self.replaceTable = True
308 self.molPklName = ''
309 self.useSmiles = True
310 self.useSD = False
311
# --- screening settings (set by --dice, --cosine, --topN, --thresh,
# --smiles and --smilesTable in ParseArgs; not read elsewhere in this view) ---
313 self.metric = DataStructs.TanimotoSimilarity
314 self.doScreen = ''
315 self.topN = 10
316 self.screenThresh = 0.75
317 self.doThreshold = 0
318 self.smilesTableName = ''
319 self.probeSmiles = ''
320 self.probeMol = None
321 self.noPickle = 0
322
# --- clustering settings (set by --SLINK, --CLINK, --UPGMA, --actTable
# and --actName in ParseArgs; not read elsewhere in this view) ---
324 self.clusterAlgo = Murtagh.WARDS
325 self.actTableName = ''
326 self.actName = ''
327
339
348
349
def Usage():
    """ writes the module usage text to stdout, then terminates the
    process with exit status -1

    """
    print(_usageDoc)
    raise SystemExit(-1)
356
357
358 _usageDoc = """
359 Usage: FingerprintMols.py [args] <fName>
360
361 If <fName> is provided and no tableName is specified (see below),
362 data will be read from the text file <fName>. Text files delimited
363 with either commas (extension .csv) or tabs (extension .txt) are
364 supported.
365
366 Command line arguments are:
367 - -d _dbName_: set the name of the database from which
368 to pull input molecule information. If output is
369 going to a database, this will also be used for that
370 unless the --outDbName option is used.
371
372 - -t _tableName_: set the name of the database table
373 from which to pull input molecule information
374
375 - --smilesName=val: sets the name of the SMILES column
376 in the input database. Default is *SMILES*.
377
378 - --useSD: Assume that the input file is an SD file, not a SMILES
379 table.
380
381 - --idName=val: sets the name of the id column in the input
382 database. Defaults to be the name of the first db column
383 (or *ID* for text files).
384
385 - -o _outFileName_: name of the output file (output will
386 be a pickle file with one label,fingerprint entry for each
387 molecule).
388
389 - --outTable=val: name of the output db table used to store
390 fingerprints. If this table already exists, it will be
391 replaced.
392
393 - --outDbName: name of output database, if it's being used.
394 Defaults to be the same as the input db.
395
396 - --fpColName=val: name to use for the column which stores
397 fingerprints (in pickled format) in the output db table.
398 Default is *AutoFragmentFP*
399
400 - --maxSize=val: base size of the fingerprints to be generated
401 Default is *2048*
402
403 - --minSize=val: minimum size of the fingerprints to be generated
404 (limits the amount of folding that happens). Default is *64*
405
406 - --density=val: target bit density in the fingerprint. The
407 fingerprint will be folded until this density is
408 reached. Default is *0.3*
409
410 - --minPath=val: minimum path length to be included in
411 fragment-based fingerprints. Default is *1*.
412
413 - --maxPath=val: maximum path length to be included in
414 fragment-based fingerprints. Default is *7*.
415
416 - --nBitsPerHash: number of bits to be set in the output
417 fingerprint for each fragment. Default is *2*.
418
419 - --discrim: use of path-based discriminators to hash bits.
420 Default is *false*.
421
422 - -V: include valence information in the fingerprints
423 Default is *false*.
424
425 - -H: include Hs in the fingerprint
426 Default is *false*.
427
428 - --maxMols=val: sets the maximum number of molecules to be
429 fingerprinted.
430
431 - --useMACCS: use the public MACCS keys to do the fingerprinting
432 (instead of a daylight-type fingerprint)
433
434 """
435
436
def ParseArgs(details=None):
    """ parses the command line arguments and returns a
    _FingerprinterDetails_ instance with the results.

    **Arguments**

      - details: optional object to fill in; a new FingerprinterDetails
        is created when this is None

    **Returns**

      the details object, updated from sys.argv[1:].  The first
      non-option argument, if any, is stored as details.inFileName.

    **Note**:

      - If you make modifications here, please update the global
        _usageDoc string so the Usage message is up to date.

      - This routine is used by both the fingerprinter, the clusterer and the
        screener; not all arguments make sense for all applications.

      - Unparseable command lines print a traceback and call Usage(),
        which exits.

    """
    longOpts = [
        'minSize=',
        'maxSize=',
        'density=',
        'minPath=',
        'maxPath=',
        'bitsPerHash=',
        # the usage text documents this spelling; accept it as well
        'nBitsPerHash=',
        'smilesName=',
        'molPkl=',
        'useSD',
        'idName=',
        'discrim',
        'outTable=',
        'outDbName=',
        'fpColName=',
        'maxMols=',
        'useMACCS',
        'keepTable',
        # screener options
        'smilesTable=',
        'doScreen=',
        'topN=',
        'thresh=',
        'smiles=',
        'dice',
        'cosine',
        # clusterer options
        'actTable=',
        'actName=',
        'SLINK',
        'CLINK',
        'UPGMA',
    ]
    try:
        args, extras = getopt.getopt(sys.argv[1:], 'HVs:d:t:o:h', longOpts)
    except getopt.GetoptError:
        import traceback
        traceback.print_exc()
        Usage()

    if details is None:
        details = FingerprinterDetails()
    if len(extras):
        details.inFileName = extras[0]

    # options whose value is stored unchanged
    strOpts = {
        '-d': 'dbName',
        '-t': 'tableName',
        '-o': 'outFileName',
        '--outTable': 'outTableName',
        '--outDbName': 'outDbName',
        '--fpColName': 'fpColName',
        '--smilesName': 'smilesName',
        '--molPkl': 'molPklName',
        '--idName': 'idName',
        '--smilesTable': 'smilesTableName',
        '--smiles': 'probeSmiles',
        '--actTable': 'actTableName',
        '--actName': 'actName',
    }
    # options whose value is stored as an int; both spellings of the
    # bits-per-hash option set the same attribute
    intOpts = {
        '--minSize': 'minSize',
        '--maxSize': 'fpSize',
        '--minPath': 'minPath',
        '--maxPath': 'maxPath',
        '--bitsPerHash': 'bitsPerHash',
        '--nBitsPerHash': 'bitsPerHash',
        '--maxMols': 'maxMols',
    }
    # value-less switches: attribute name and the value it is set to
    flagOpts = {
        '-H': ('useHs', 1),
        '-V': ('useValence', 1),
        '--discrim': ('discrimHash', 1),
        '--keepTable': ('replaceTable', False),
    }

    # -s and --doScreen are accepted by getopt but have no effect here
    for arg, val in args:
        if arg in strOpts:
            setattr(details, strOpts[arg], val)
        elif arg in intOpts:
            setattr(details, intOpts[arg], int(val))
        elif arg in flagOpts:
            attrName, attrVal = flagOpts[arg]
            setattr(details, attrName, attrVal)
        elif arg == '--density':
            details.tgtDensity = float(val)
        elif arg == '--useSD':
            details.useSmiles = False
            details.useSD = True
        elif arg == '--useMACCS':
            details.fingerprinter = MACCSkeys.GenMACCSKeys
        # screener options
        elif arg == '--topN':
            details.doThreshold = 0
            details.topN = int(val)
        elif arg == '--thresh':
            details.doThreshold = 1
            details.screenThresh = float(val)
        elif arg == '--dice':
            details.metric = DataStructs.DiceSimilarity
        elif arg == '--cosine':
            details.metric = DataStructs.CosineSimilarity
        # clusterer options
        elif arg == '--SLINK':
            details.clusterAlgo = Murtagh.SLINK
        elif arg == '--CLINK':
            details.clusterAlgo = Murtagh.CLINK
        elif arg == '--UPGMA':
            details.clusterAlgo = Murtagh.UPGMA
        elif arg == '-h':
            Usage()
    return details
574
575
if __name__ == '__main__':
    # command-line entry point: announce, parse sys.argv, then fingerprint
    message("This is FingerprintMols\n\n")
    FingerprintsFromDetails(ParseArgs())
580