1
2
3
4
5
6 from __future__ import print_function
7
8 import bisect
9
10 from rdkit import DataStructs
11 from rdkit.DataStructs.TopNContainer import TopNContainer
12
13
# Default state: no picks have been computed yet. The pickers in this file
# compare `_picks` against None before doing any work.
15 _picks = None
16
# Always raises: per the message, GenericPicker is a virtual base class and
# is not meant to be used directly.
18 raise NotImplementedError("GenericPicker is a virtual base class")
19
24
29
30
32 """ A class for picking the top N overall best matches across a library
33
34 Connect to a database and build molecules:
35 >>> from rdkit import Chem
36 >>> from rdkit import RDConfig
37 >>> import os.path
38 >>> from rdkit.Dbase.DbConnection import DbConnect
39 >>> dbName = RDConfig.RDTestDatabase
40 >>> conn = DbConnect(dbName,'simple_mols1')
41 >>> [x.upper() for x in conn.GetColumnNames()]
42 ['SMILES', 'ID']
43 >>> mols = []
44 >>> for smi,id in conn.GetData():
45 ... mol = Chem.MolFromSmiles(str(smi))
46 ... mol.SetProp('_Name',str(id))
47 ... mols.append(mol)
48 >>> len(mols)
49 12
50
51 Calculate fingerprints:
52 >>> probefps = []
53 >>> for mol in mols:
54 ... fp = Chem.RDKFingerprint(mol)
55 ... fp._id = mol.GetProp('_Name')
56 ... probefps.append(fp)
57
58 Start by finding the top matches for a single probe. This ether should pull
59 other ethers from the db:
60 >>> mol = Chem.MolFromSmiles('COC')
61 >>> probeFp = Chem.RDKFingerprint(mol)
62 >>> picker = TopNOverallPicker(numToPick=2,probeFps=[probeFp],dataSet=probefps)
63 >>> len(picker)
64 2
65 >>> fp,score = picker[0]
66 >>> id = fp._id
67 >>> str(id)
68 'ether-1'
69 >>> score
70 1.0
71
72 The results come back in order:
73 >>> fp,score = picker[1]
74 >>> id = fp._id
75 >>> str(id)
76 'ether-2'
77
78 Now find the top matches for 2 probes. We'll get one ether and one acid:
79 >>> fps = []
80 >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('COC')))
81 >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('CC(=O)O')))
82 >>> picker = TopNOverallPicker(numToPick=3,probeFps=fps,dataSet=probefps)
83 >>> len(picker)
84 3
85 >>> fp,score = picker[0]
86 >>> id = fp._id
87 >>> str(id)
88 'acid-1'
89 >>> fp,score = picker[1]
90 >>> id = fp._id
91 >>> str(id)
92 'ether-1'
93 >>> score
94 1.0
95 >>> fp,score = picker[2]
96 >>> id = fp._id
97 >>> str(id)
98 'acid-2'
99
100 """
101
104 """
105
106 dataSet should be a sequence of BitVectors
107
108 """
# Only stores the configuration; no similarity is computed here.
# numToPick is later handed to TopNContainer when the picks are made.
109 self.numToPick = numToPick
# Probe fingerprints: every data point is compared against each of these.
110 self.probes = probeFps
# The library being searched; iterated once per MakePicks run.
111 self.data = dataSet
# Metric forwarded to DataStructs.FingerprintSimilarity.
112 self.simMetric = simMetric
# None marks "not computed yet"; MakePicks tests this before recomputing.
113 self._picks = None
114
# Reuse the cached picks unless the caller forces a recomputation.
116 if self._picks is not None and not force:
117 return
# Container built with numToPick; presumably it retains only the
# best-scoring entries -- confirm against TopNContainer.
118 picks = TopNContainer(self.numToPick)
119 for fp in self.data:
120 origFp = fp
# Start below any score the metric is expected to return so that the first
# probe's score always replaces it -- assumes similarities are >= 0.
121 bestScore = -1.0
122 for probeFp in self.probes:
123 score = DataStructs.FingerprintSimilarity(origFp, probeFp, self.simMetric)
# Track the highest similarity this data point reaches against any probe.
124 bestScore = max(score, bestScore)
# Record the data point under its best score.
125 picks.Insert(bestScore, fp)
# Flatten the container into a list of (fingerprint, score) tuples.
126 self._picks = []
127 for score, pt in picks:
128 self._picks.append((pt, score))
# Flip the container's iteration order; the class doctest expects the best
# match at index 0 -- presumably the container iterates worst-to-best.
129 self._picks.reverse()
130
131
133 """ A class for picking the best matches across a library
134
135 Connect to a database:
136 >>> from rdkit import Chem
137 >>> from rdkit import RDConfig
138 >>> import os.path
139 >>> from rdkit.Dbase.DbConnection import DbConnect
140 >>> dbName = RDConfig.RDTestDatabase
141 >>> conn = DbConnect(dbName,'simple_mols1')
142 >>> [x.upper() for x in conn.GetColumnNames()]
143 ['SMILES', 'ID']
144 >>> mols = []
145 >>> for smi,id in conn.GetData():
146 ... mol = Chem.MolFromSmiles(str(smi))
147 ... mol.SetProp('_Name',str(id))
148 ... mols.append(mol)
149 >>> len(mols)
150 12
151
152 Calculate fingerprints:
153 >>> probefps = []
154 >>> for mol in mols:
155 ... fp = Chem.RDKFingerprint(mol)
156 ... fp._id = mol.GetProp('_Name')
157 ... probefps.append(fp)
158
159 Start by finding the top matches for a single probe. This ether should pull
160 other ethers from the db:
161 >>> mol = Chem.MolFromSmiles('COC')
162 >>> probeFp = Chem.RDKFingerprint(mol)
163 >>> picker = SpreadPicker(numToPick=2,probeFps=[probeFp],dataSet=probefps)
164 >>> len(picker)
165 2
166 >>> fp,score = picker[0]
167 >>> id = fp._id
168 >>> str(id)
169 'ether-1'
170 >>> score
171 1.0
172
173 The results come back in order:
174 >>> fp,score = picker[1]
175 >>> id = fp._id
176 >>> str(id)
177 'ether-2'
178
179 Now find the top matches for 2 probes. We'll get one ether and one acid:
180 >>> fps = []
181 >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('COC')))
182 >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('CC(=O)O')))
183 >>> picker = SpreadPicker(numToPick=3,probeFps=fps,dataSet=probefps)
184 >>> len(picker)
185 3
186 >>> fp,score = picker[0]
187 >>> id = fp._id
188 >>> str(id)
189 'ether-1'
190 >>> score
191 1.0
192 >>> fp,score = picker[1]
193 >>> id = fp._id
194 >>> str(id)
195 'acid-1'
196 >>> score
197 1.0
198 >>> fp,score = picker[2]
199 >>> id = fp._id
200 >>> str(id)
201 'ether-2'
202
203 """
204
207 """
208
209 dataSet should be a sequence of BitVectors or, if expectPickles
210 is False, a set of strings that can be converted to bit vectors
211
212 """
# Only stores the configuration; no similarity is computed here.
# numToPick bounds the number of picks produced by MakePicks.
213 self.numToPick = numToPick
# Probe fingerprints; MakePicks takes turns between them when picking.
214 self.probes = probeFps
# The library being searched; iterated once per MakePicks run.
215 self.data = dataSet
# Metric forwarded to DataStructs.FingerprintSimilarity.
216 self.simMetric = simMetric
# Stored as given; the MakePicks code visible in this file does not read it.
217 self.expectPickles = expectPickles
# When true, MakePicks stores `_fieldsFromDb[0]` of a data point (if it has
# that attribute) instead of the data point itself.
218 self.onlyNames = onlyNames
219
# None marks "not computed yet"; MakePicks tests this before recomputing.
220 self._picks = None
221
def MakePicks(self, force=False, silent=False):
    """Pick up to ``numToPick`` data points, taking turns between the probes.

    Every entry of ``self.data`` is scored against every probe with
    ``DataStructs.FingerprintSimilarity`` using ``self.simMetric``.  The
    probes are then visited round-robin, and each visit contributes that
    probe's highest-scoring data point that has not been picked already.

    The result is stored in ``self._picks`` as a list of ``(fp, score)``
    tuples in pick order.  When ``self.onlyNames`` is true and a data point
    has a ``_fieldsFromDb`` attribute, the first of those fields is stored
    in place of the data point itself.

    Arguments:
      - force: recompute even when picks are already cached
      - silent: suppress the progress message printed every 1000 data points

    Fewer than ``numToPick`` picks are produced when the probes run out of
    candidates (an empty probe list, or fewer data points than requested).
    """
    if self._picks is not None and not force:
        return

    numToPick = self.numToPick
    probes = self.probes
    nProbes = len(probes)

    # One ascending list of (score, data index) per probe.
    scores = [[] for _ in range(nProbes)]
    fps = []
    for j, origFp in enumerate(self.data):
        for row, probeFp in zip(scores, probes):
            score = DataStructs.FingerprintSimilarity(probeFp, origFp, self.simMetric)
            bisect.insort(row, (score, j))
            # Keep only the numToPick best entries of THIS probe's row.  The
            # selection below pops at most numToPick entries from any row, so
            # dropping the low end never changes the outcome.
            excess = len(row) - numToPick
            if excess > 0:
                del row[:excess]
        if self.onlyNames and hasattr(origFp, '_fieldsFromDb'):
            fps.append(origFp._fieldsFromDb[0])
        else:
            fps.append(origFp)
        nScored = j + 1
        if not silent and not nScored % 1000:
            print('scored %d fps' % nScored)

    picks = []
    taken = [False] * len(fps)
    rowIdx = 0
    # Number of consecutive probes that had nothing left to offer; once every
    # probe has come up empty there is nothing more to pick.
    nEmpty = 0
    while len(picks) < numToPick and nEmpty < nProbes:
        row = scores[rowIdx]
        rowIdx = (rowIdx + 1) % nProbes
        # Discard candidates another probe has already claimed.
        while row and taken[row[-1][1]]:
            row.pop()
        if not row:
            nEmpty += 1
            continue
        score, idx = row.pop()
        picks.append((fps[idx], score))
        taken[idx] = True
        nEmpty = 0
    self._picks = picks
264
265
266
267
268
269
# Imported here rather than at the top of the file; they are only needed
# when the doctests are run.
271 import sys
272 import doctest
# Run this module's doctests with ELLIPSIS matching enabled.
273 failed, _ = doctest.testmod(optionflags=doctest.ELLIPSIS, verbose=verbose)
# Use the number of failed examples as the process exit status (0 == success).
274 sys.exit(failed)
275
276
# Script entry point: run the doctests only when executed directly, not on import.
277 if __name__ == '__main__':
278 _runDoctests()
279