Package rdkit :: Package Chem :: Package Fingerprints :: Module MolSimilarity
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.Fingerprints.MolSimilarity

  1  # 
  2  #  Copyright (c) 2003-2006 Rational Discovery LLC 
  3  # 
  4  #   @@ All Rights Reserved @@ 
  5  #  This file is part of the RDKit. 
  6  #  The contents are covered by the terms of the BSD license 
  7  #  which is included in the file license.txt, found at the root 
  8  #  of the RDKit source tree. 
  9  # 
 10  """ utility functionality for molecular similarity 
 11   includes a command line app for screening databases 
 12   
 13   
 14  Sample Usage: 
 15   
 16    python MolSimilarity.py  -d data.gdb -t daylight_sig --idName="Mol_ID" \ 
 17        --topN=100 --smiles='c1(C=O)ccc(Oc2ccccc2)cc1' --smilesTable=raw_dop_data \ 
 18        --smilesName="structure" -o results.csv 
 19   
 20  """ 
 21  import types 
 22   
 23  from rdkit import Chem 
 24  from rdkit import DataStructs 
 25  from rdkit.Chem.Fingerprints import FingerprintMols, DbFpSupplier 
 26  from rdkit.DataStructs.TopNContainer import TopNContainer 
 27  from rdkit.Dbase import DbModule 
 28  from rdkit.Dbase.DbConnection import DbConnect 
 29  from rdkit.six.moves import cPickle 
 30   
 31  try: 
 32    from rdkit.VLib.NodeLib.DbPickleSupplier import _lazyDataSeq as _dataSeq 
 33  except ImportError: 
 34    _dataSeq = None 
 35   
 36   
def _ConstructSQL(details, extraFields=''):
  """ builds the SQL select statement used to pull rows from the fingerprint table

    Arguments:
      - details: object providing the attributes tableName, idName,
        smilesTableName, smilesName, actTableName and actName
      - extraFields: (optional) text appended verbatim to the list of
        selected fields

    Returns: the SQL command as a string

    NOTE: table and column names are interpolated directly into the
    statement, so they must come from a trusted source.

  """
  fields = ['%s.%s' % (details.tableName, details.idName)]
  joins = []
  if details.smilesTableName:
    if details.smilesName:
      fields.append(str(details.smilesName))
    joins.append('join %s smi on smi.%s=%s.%s' % (details.smilesTableName, details.idName,
                                                  details.tableName, details.idName))
  if details.actTableName:
    if details.actName:
      fields.append(str(details.actName))
    joins.append('join %s act on act.%s=%s.%s' % (details.actTableName, details.idName,
                                                  details.tableName, details.idName))
  if extraFields:
    fields.append(extraFields)
  # the join clauses must be space-separated: gluing them together directly
  # yields invalid SQL when both the SMILES and activity tables are used
  cmd = 'select %s from %s %s' % (','.join(fields), details.tableName, ' '.join(joins))
  return cmd
55 56
def ScreenInDb(details, mol):
  """ screens a fingerprint table against a probe molecule, letting the
  database compute the similarity when the metric allows it

    Arguments:
      - details: the screening parameters (dbName, tableName, metric,
        fpColName, doThreshold, screenThresh, topN, ...); its __dict__ is
        also passed as keyword arguments to FingerprintMols.FingerprintMol
      - mol: the probe molecule

    Returns:
      - for the Tanimoto, Dice and Cosine metrics: the rows fetched from the
        database query, ordered by decreasing similarity
      - for any other metric: the result of ScreenFingerprints()
      - an empty list if the probe cannot be fingerprinted or the database
        connection cannot be set up

  """
  import traceback
  try:
    probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
  except Exception:
    FingerprintMols.error('Error: problems fingerprinting molecule.\n')
    traceback.print_exc()
    return []

  # metrics which have a matching similarity function inside the database
  dbFuncs = ((DataStructs.TanimotoSimilarity, 'rd_tanimoto'),
             (DataStructs.DiceSimilarity, 'rd_dice'),
             (DataStructs.CosineSimilarity, 'rd_cosine'))
  func = None
  for metric, funcName in dbFuncs:
    if details.metric == metric:
      func = funcName
      break

  if func is None:
    # no in-database implementation: pull the fingerprints and score them
    # here, reusing the probe fingerprint which was already calculated
    data = GetFingerprints(details)
    return ScreenFingerprints(details, data, mol, probeFp=probeFp)

  if not (details.dbName and details.tableName):
    FingerprintMols.error('Error: a database and table name are required for in-database screening.\n')
    return []
  try:
    conn = DbConnect(details.dbName, details.tableName)
    if hasattr(details, 'dbUser'):
      conn.user = details.dbUser
    if hasattr(details, 'dbPassword'):
      conn.password = details.dbPassword
  except Exception:
    FingerprintMols.error('Error: Problems establishing connection to database: %s|%s\n' %
                          (details.dbName, details.tableName))
    traceback.print_exc()
    # without a connection there is nothing to query
    return []

  pkl = probeFp.ToBitString()
  extraFields = "%s(%s,%s) as tani" % (func, DbModule.placeHolder, details.fpColName)
  cmd = _ConstructSQL(details, extraFields=extraFields)

  if details.doThreshold:
    # we need to do a subquery here:
    cmd = "select * from (%s) tmp where tani>%f" % (cmd, details.screenThresh)
  cmd += " order by tani desc"
  if not details.doThreshold and details.topN > 0:
    cmd += " limit %d" % details.topN
  curs = conn.GetCursor()
  # the probe bit string is bound to the placeholder inside the similarity call
  curs.execute(cmd, (pkl, ))
  return curs.fetchall()
107 108
def GetFingerprints(details):
  """ returns an iterable sequence of fingerprints
  each fingerprint will have a _fieldsFromDb member whose first entry is
  the id.

    Arguments:
      - details: the screening parameters; fingerprints are read from the
        database if dbName and tableName are set, otherwise from the pickle
        file inFileName (a series of pickled (ID,fingerprint) tuples)

    Returns:
      - a fingerprint supplier (database) or a list of fingerprints (file)
      - an empty list if the database connection or the file cannot be opened
      - None if neither a database nor an input file is specified

  """
  if details.dbName and details.tableName:
    try:
      conn = DbConnect(details.dbName, details.tableName)
      if hasattr(details, 'dbUser'):
        conn.user = details.dbUser
      if hasattr(details, 'dbPassword'):
        conn.password = details.dbPassword
    except Exception:
      import traceback
      FingerprintMols.error('Error: Problems establishing connection to database: %s|%s\n' %
                            (details.dbName, details.tableName))
      traceback.print_exc()
      # nothing can be read without a connection
      return []
    cmd = _ConstructSQL(details, extraFields=details.fpColName)
    curs = conn.GetCursor()
    if _dataSeq:
      suppl = _dataSeq(curs, cmd, depickle=not details.noPickle, klass=DataStructs.ExplicitBitVect)
      # NOTE(review): this sets the attribute on the class, not on suppl;
      # presumably it is there to keep the connection alive - confirm
      _dataSeq._conn = conn
    else:
      # NOTE(review): 'data' is not defined anywhere in this function, so this
      # fallback raises a NameError. It looks like it should be a result set
      # for cmd - confirm against DbFpSupplier before fixing.
      suppl = DbFpSupplier.ForwardDbFpSupplier(data, fpColName=details.fpColName)
  elif details.inFileName:
    suppl = []
    try:
      # pickles are binary data, so the file has to be opened in binary mode
      inF = open(details.inFileName, 'rb')
    except IOError:
      import traceback
      FingerprintMols.error('Error: Problems reading from file %s\n' % (details.inFileName))
      traceback.print_exc()
      return suppl

    # SECURITY: unpickling executes arbitrary code; only read trusted files
    with inF:
      while True:
        try:
          ID, fp = cPickle.load(inF)
        except Exception:
          # EOFError marks the normal end of the file; any other failure also
          # ends the read (best effort, as before)
          break
        fp._fieldsFromDb = [ID]
        suppl.append(fp)
  else:
    suppl = None

  return suppl
159 160
def ScreenFingerprints(details, data, mol=None, probeFp=None):
  """ Returns a list of results

    Arguments:
      - details: the screening parameters (metric, noPickle, doThreshold,
        screenThresh, topN and, optionally, stopAfter)
      - data: an iterable of fingerprints. Unless details.noPickle is set
        each entry is either an (ID,fingerprint) tuple/list or a fingerprint
        with a _fieldsFromDb member whose first entry is the id; with
        details.noPickle set each entry is an (ID,pickle) pair which is
        passed to the metric as a string.
      - mol: the probe molecule, only used if probeFp is not provided
      - probeFp: (optional) the fingerprint of the probe molecule

    Returns: a list of (ID,score) tuples. If details.doThreshold is not set
      and details.topN is positive only the top N hits are kept, otherwise
      hits scoring below details.screenThresh are dropped when
      details.doThreshold is set. An empty list is returned if the probe
      cannot be fingerprinted or its fingerprint is empty.

  """
  if probeFp is None:
    try:
      probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
    except Exception:
      import traceback
      FingerprintMols.error('Error: problems fingerprinting molecule.\n')
      traceback.print_exc()
      return []
  if not probeFp:
    return []

  if not details.doThreshold and details.topN > 0:
    topN = TopNContainer(details.topN)
  else:
    topN = []
  res = []
  count = 0
  for pt in data:
    if not details.noPickle:
      if isinstance(pt, (tuple, list)):
        ID, fp = pt
      else:
        fp = pt
        ID = pt._fieldsFromDb[0]
      score = DataStructs.FingerprintSimilarity(probeFp, fp, details.metric)
    else:
      ID, pkl = pt
      score = details.metric(probeFp, str(pkl))
    if topN:
      topN.Insert(score, ID)
    elif not details.doThreshold or score >= details.screenThresh:
      res.append((ID, score))
    count += 1
    # stopAfter limits the number of fingerprints examined, not the hits
    if hasattr(details, 'stopAfter') and count >= details.stopAfter:
      break
  # the container is iterated as (score,ID) pairs; flip them for the output
  for score, ID in topN:
    res.append((ID, score))

  return res
207 208
def ScreenFromDetails(details, mol=None):
  """ Returns a list of results

    Arguments:
      - details: the screening parameters. If mol is not provided the probe
        is taken from details.probeMol or built from details.probeSmiles.
        If details.outFileName is set the results are also written to that
        file, one comma-separated line per result.
      - mol: (optional) the probe molecule

    Returns: the list of results from ScreenFingerprints() or, if
      details.useDbSimilarity is set, from ScreenInDb(). None is returned
      if no probe molecule is available or the output file cannot be opened.

  """
  if not mol:
    if not details.probeMol:
      smi = details.probeSmiles
      try:
        mol = Chem.MolFromSmiles(smi)
      except Exception:
        import traceback
        FingerprintMols.error('Error: problems generating molecule for smiles: %s\n' % (smi))
        traceback.print_exc()
        return
    else:
      mol = details.probeMol
  if not mol:
    return

  # the output file is opened before screening so that an unwritable
  # destination is reported before any expensive work is done
  outF = None
  if details.outFileName:
    try:
      outF = open(details.outFileName, 'w+')
    except IOError:
      FingerprintMols.error("Error: could not open output file %s for writing\n" %
                            (details.outFileName))
      return None

  try:
    if not getattr(details, 'useDbSimilarity', False):
      data = GetFingerprints(details)
      res = ScreenFingerprints(details, data, mol)
    else:
      res = ScreenInDb(details, mol)
    if outF is not None:
      for pt in res:
        outF.write(','.join([str(x) for x in pt]))
        outF.write('\n')
  finally:
    # make sure the results are flushed and the handle is released even if
    # the screening fails
    if outF is not None:
      outF.close()
  return res
# usage message for the command line app; it is handed to FingerprintMols
# before the arguments are parsed (see the bottom of the file)
_usageDoc = """
Usage: MolSimilarity.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the pickled file <fName>. This file should
  contain a series of pickled (ID,fingerprint) tuples.

  NOTE: at the moment the user is responsible for ensuring that the
  fingerprint parameters given at run time (used to fingerprint the
  probe molecule) match those used to generate the input fingerprints.

  Command line arguments are:
    - --smiles=val: sets the SMILES for the input molecule. This is
      a required argument.

    - -d _dbName_: set the name of the database from which
      to pull input fingerprint information.

    - -t _tableName_: set the name of the database table
      from which to pull input fingerprint information

    - --smilesTable=val: sets the name of the database table
      which contains SMILES for the input fingerprints. If this
      information is provided along with smilesName (see below),
      the output file will contain SMILES data

    - --smilesName=val: sets the name of the SMILES column
      in the input database. Default is *SMILES*.

    - --topN=val: sets the number of results to return.
      Default is *10*.

    - --thresh=val: sets the similarity threshold.

    - --idName=val: sets the name of the id column in the input
      database. Default is *ID*.

    - -o _outFileName_: name of the output file (output will
      be a CSV file with one line for each of the output molecules)

    - --dice: use the DICE similarity metric instead of Tanimoto

    - --cosine: use the cosine similarity metric instead of Tanimoto

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the output db table.
      Default is *AutoFragmentFP*

    - --minPath=val: minimum path length to be included in
      fragment-based fingerprints. Default is *1*.

    - --maxPath=val: maximum path length to be included in
      fragment-based fingerprints. Default is *7*.

    - --nBitsPerHash=val: number of bits to be set in the output
      fingerprint for each fragment. Default is *4*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)


"""
if __name__ == '__main__':
  FingerprintMols.message("This is MolSimilarity\n\n")
  FingerprintMols._usageDoc = _usageDoc
  details = FingerprintMols.ParseArgs()
  ScreenFromDetails(details)