Package rdkit :: Package SimDivFilters :: Module SimilarityPickers
[hide private]
[frames | no frames]

Source Code for Module rdkit.SimDivFilters.SimilarityPickers

  1  # $Id$ 
  2  # 
  3  # Copyright (C) 2003-2008 Greg Landrum and Rational Discovery LLC 
  4  #  All Rights Reserved 
  5  # 
  6  from __future__ import print_function 
  7   
  8  import bisect 
  9   
 10  from rdkit import DataStructs 
 11  from rdkit.DataStructs.TopNContainer import TopNContainer 
 12   
 13   
class GenericPicker(object):
    """Base class for pickers whose results are computed on first access.

    A subclass overrides MakePicks() so that it stores a sequence in
    ``self._picks``.  ``len(picker)`` and ``picker[i]`` call MakePicks()
    themselves when ``_picks`` is still None.
    """
    # cached results; stays None until MakePicks() fills it in
    _picks = None

    def MakePicks(self, force=False):
        """Compute the picks.  Not implemented here; subclasses override it."""
        raise NotImplementedError("GenericPicker is a virtual base class")

    def __len__(self):
        """Return the number of picks, computing them first if necessary."""
        picks = self._picks
        if picks is None:
            self.MakePicks()
            # re-read: MakePicks() is expected to have replaced the attribute
            picks = self._picks
        return len(picks)

    def __getitem__(self, which):
        """Return pick number `which`, computing the picks first if necessary."""
        picks = self._picks
        if picks is None:
            self.MakePicks()
            picks = self._picks
        return picks[which]
29 30
class TopNOverallPicker(GenericPicker):
    """  Picks the N data-set entries that score best against any probe

    Connect to a database and build molecules:
    >>> from rdkit import Chem
    >>> from rdkit import RDConfig
    >>> import os.path
    >>> from rdkit.Dbase.DbConnection import DbConnect
    >>> dbName = RDConfig.RDTestDatabase
    >>> conn = DbConnect(dbName,'simple_mols1')
    >>> [x.upper() for x in conn.GetColumnNames()]
    ['SMILES', 'ID']
    >>> mols = []
    >>> for smi,id in conn.GetData():
    ...   mol = Chem.MolFromSmiles(str(smi))
    ...   mol.SetProp('_Name',str(id))
    ...   mols.append(mol)
    >>> len(mols)
    12

    Calculate fingerprints:
    >>> probefps = []
    >>> for mol in mols:
    ...   fp = Chem.RDKFingerprint(mol)
    ...   fp._id = mol.GetProp('_Name')
    ...   probefps.append(fp)

    Start with the top matches for a single probe.  This ether should pull
    other ethers from the db:
    >>> mol = Chem.MolFromSmiles('COC')
    >>> probeFp = Chem.RDKFingerprint(mol)
    >>> picker = TopNOverallPicker(numToPick=2,probeFps=[probeFp],dataSet=probefps)
    >>> len(picker)
    2
    >>> fp,score = picker[0]
    >>> id = fp._id
    >>> str(id)
    'ether-1'
    >>> score
    1.0

    The results come back in order:
    >>> fp,score = picker[1]
    >>> id = fp._id
    >>> str(id)
    'ether-2'

    Now use 2 probes (an ether and an acid) and ask for three picks:
    >>> fps = []
    >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('COC')))
    >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('CC(=O)O')))
    >>> picker = TopNOverallPicker(numToPick=3,probeFps=fps,dataSet=probefps)
    >>> len(picker)
    3
    >>> fp,score = picker[0]
    >>> id = fp._id
    >>> str(id)
    'acid-1'
    >>> fp,score = picker[1]
    >>> id = fp._id
    >>> str(id)
    'ether-1'
    >>> score
    1.0
    >>> fp,score = picker[2]
    >>> id = fp._id
    >>> str(id)
    'acid-2'

    """

    def __init__(self, numToPick=10, probeFps=None, dataSet=None,
                 simMetric=DataStructs.TanimotoSimilarity):
        """

          dataSet should be a sequence of BitVectors

          numToPick is the capacity handed to the TopNContainer, probeFps
          are the fingerprints every data-set entry is compared with, and
          simMetric is forwarded to DataStructs.FingerprintSimilarity.

        """
        self.numToPick = numToPick
        self.probes = probeFps
        self.data = dataSet
        self.simMetric = simMetric
        self._picks = None

    def MakePicks(self, force=False):
        """ Fills self._picks with (fingerprint, score) tuples.

          Does nothing when picks already exist unless force is true.
          Each data-set entry is scored by its highest similarity to any
          probe (-1.0 when there are no probes).

        """
        if not force and self._picks is not None:
            return

        container = TopNContainer(self.numToPick)
        for candidate in self.data:
            best = -1.0
            for probe in self.probes:
                similarity = DataStructs.FingerprintSimilarity(candidate, probe, self.simMetric)
                best = max(similarity, best)
            container.Insert(best, candidate)

        # the container yields (score, item); callers get (item, score).
        # NOTE(review): the reversal presumably turns an ascending container
        # order into best-first, as the doctest above expects - confirm
        # against TopNContainer.
        ranked = [(item, score) for score, item in container]
        ranked.reverse()
        self._picks = ranked
130 131
class SpreadPicker(GenericPicker):
    """  A class for picking the best matches across a library

    Connect to a database:
    >>> from rdkit import Chem
    >>> from rdkit import RDConfig
    >>> import os.path
    >>> from rdkit.Dbase.DbConnection import DbConnect
    >>> dbName = RDConfig.RDTestDatabase
    >>> conn = DbConnect(dbName,'simple_mols1')
    >>> [x.upper() for x in conn.GetColumnNames()]
    ['SMILES', 'ID']
    >>> mols = []
    >>> for smi,id in conn.GetData():
    ...   mol = Chem.MolFromSmiles(str(smi))
    ...   mol.SetProp('_Name',str(id))
    ...   mols.append(mol)
    >>> len(mols)
    12

    Calculate fingerprints:
    >>> probefps = []
    >>> for mol in mols:
    ...   fp = Chem.RDKFingerprint(mol)
    ...   fp._id = mol.GetProp('_Name')
    ...   probefps.append(fp)

    Start by finding the top matches for a single probe.  This ether should pull
    other ethers from the db:
    >>> mol = Chem.MolFromSmiles('COC')
    >>> probeFp = Chem.RDKFingerprint(mol)
    >>> picker = SpreadPicker(numToPick=2,probeFps=[probeFp],dataSet=probefps)
    >>> len(picker)
    2
    >>> fp,score = picker[0]
    >>> id = fp._id
    >>> str(id)
    'ether-1'
    >>> score
    1.0

    The results come back in order:
    >>> fp,score = picker[1]
    >>> id = fp._id
    >>> str(id)
    'ether-2'

    Now find the top matches for 2 probes.  The picks alternate between the
    probes, so we get an ether, then an acid, then another ether:
    >>> fps = []
    >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('COC')))
    >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('CC(=O)O')))
    >>> picker = SpreadPicker(numToPick=3,probeFps=fps,dataSet=probefps)
    >>> len(picker)
    3
    >>> fp,score = picker[0]
    >>> id = fp._id
    >>> str(id)
    'ether-1'
    >>> score
    1.0
    >>> fp,score = picker[1]
    >>> id = fp._id
    >>> str(id)
    'acid-1'
    >>> score
    1.0
    >>> fp,score = picker[2]
    >>> id = fp._id
    >>> str(id)
    'ether-2'

    """

    def __init__(self, numToPick=10, probeFps=None, dataSet=None,
                 simMetric=DataStructs.TanimotoSimilarity, expectPickles=True, onlyNames=False):
        """

          dataSet should be a sequence of BitVectors or, if expectPickles
          is False, a set of strings that can be converted to bit vectors

          numToPick is the maximum number of picks, probeFps are the probe
          fingerprints and simMetric is forwarded to
          DataStructs.FingerprintSimilarity.  If onlyNames is true, entries
          that carry a _fieldsFromDb attribute are reported by its first
          element instead of by the fingerprint itself.

          NOTE(review): expectPickles is only stored here; MakePicks() in
          this class never consults it - confirm whether any conversion is
          expected to happen elsewhere.

        """
        self.numToPick = numToPick
        self.probes = probeFps
        self.data = dataSet
        self.simMetric = simMetric
        self.expectPickles = expectPickles
        self.onlyNames = onlyNames

        self._picks = None

    @staticmethod
    def _popBestUntaken(row, taken):
        """ Pops entries off the high end of row until one is not yet taken.

          Returns that (score, index) tuple, or None if the row runs out.

        """
        while row:
            score, idx = row.pop()
            if not taken[idx]:
                return score, idx
        return None

    def MakePicks(self, force=False, silent=False):
        """ Fills self._picks with (fingerprint-or-name, score) tuples.

          Picks are taken probe by probe in rotation: each turn takes the
          best-scoring entry for that probe that no earlier turn has used.
          At most numToPick picks are made, and fewer when the data set is
          smaller than that.  With an empty probe list there are no picks.

          Arguments:
            - force: recompute even if picks already exist
            - silent: suppress the progress message printed every 1000
              scored fingerprints

        """
        if self._picks is not None and not force:
            return

        numToPick = self.numToPick
        probes = self.probes

        # start by getting the NxM score matrix
        #  (N=num probes, M=num fps)
        # each row is kept sorted ascending as (score, data index) tuples
        scores = [[] for _ in range(len(probes))]
        fps = []
        for j, origFp in enumerate(self.data):
            for row, probeFp in zip(scores, probes):
                score = DataStructs.FingerprintSimilarity(probeFp, origFp, self.simMetric)
                bisect.insort(row, (score, j))
                # Trim *this probe's row* to its numToPick best entries by
                # dropping the lowest one.  (The old code sliced the list of
                # rows instead, which discarded whole probes.)  numToPick
                # entries per row are always enough: at most numToPick
                # entries are ever taken overall.
                if len(row) > numToPick:
                    del row[0]
            if self.onlyNames and hasattr(origFp, '_fieldsFromDb'):
                fps.append(origFp._fieldsFromDb[0])
            else:
                fps.append(origFp)
            nScored = j + 1
            if not silent and not nScored % 1000:
                print('scored %d fps' % nScored)

        # now go probe by probe and select the current top entry until we are finished:
        picks = []
        if scores:
            # never ask for more picks than there are data points
            nWanted = min(numToPick, len(fps))
            taken = [False] * len(fps)
            while len(picks) < nWanted:
                row = scores[len(picks) % len(scores)]
                # make sure we haven't taken this one already (from another row):
                best = self._popBestUntaken(row, taken)
                if best is None:
                    # defensive: stop rather than spin on an exhausted row
                    break
                score, idx = best
                taken[idx] = True
                picks.append((fps[idx], score))
        self._picks = picks
264 265 266 # ------------------------------------ 267 # 268 # doctest boilerplate 269 #
270 -def _runDoctests(verbose=None): # pragma: nocover
271 import sys 272 import doctest 273 failed, _ = doctest.testmod(optionflags=doctest.ELLIPSIS, verbose=verbose) 274 sys.exit(failed) 275 276 277 if __name__ == '__main__': # pragma: nocover 278 _runDoctests() 279