1
2
3
4
5
6
7
8
9
10 from __future__ import print_function
11 from rdkit import Chem
12 from rdkit.Chem import rdMolDescriptors
13 import math
14
15
17 """
18
19 **Arguments**:
20
21 - the code to be considered
22
23 - branchSubtract: (optional) the constant that was subtracted off
24 the number of neighbors before integrating it into the code.
25 This is used by the topological torsions code.
26
27 - includeChirality: (optional) Determines whether or not chirality
28 was included when generating the atom code.
29
30 >>> m = Chem.MolFromSmiles('C=CC(=O)O')
31 >>> code = GetAtomCode(m.GetAtomWithIdx(0))
32 >>> ExplainAtomCode(code)
33 ('C', 1, 1)
34 >>> code = GetAtomCode(m.GetAtomWithIdx(1))
35 >>> ExplainAtomCode(code)
36 ('C', 2, 1)
37 >>> code = GetAtomCode(m.GetAtomWithIdx(2))
38 >>> ExplainAtomCode(code)
39 ('C', 3, 1)
40 >>> code = GetAtomCode(m.GetAtomWithIdx(3))
41 >>> ExplainAtomCode(code)
42 ('O', 1, 1)
43 >>> code = GetAtomCode(m.GetAtomWithIdx(4))
44 >>> ExplainAtomCode(code)
45 ('O', 1, 0)
46
47 we can do chirality too, that returns an extra element in the tuple:
48 >>> m = Chem.MolFromSmiles('C[C@H](F)Cl')
49 >>> code = GetAtomCode(m.GetAtomWithIdx(1))
50 >>> ExplainAtomCode(code)
51 ('C', 3, 0)
52 >>> code = GetAtomCode(m.GetAtomWithIdx(1),includeChirality=True)
53 >>> ExplainAtomCode(code,includeChirality=True)
54 ('C', 3, 0, 'R')
55
56 note that if we don't ask for chirality, we get the right answer even if
57 the atom code was calculated with chirality:
58 >>> ExplainAtomCode(code)
59 ('C', 3, 0)
60
61 non-chiral atoms return '' in the 4th field:
62 >>> code = GetAtomCode(m.GetAtomWithIdx(0),includeChirality=True)
63 >>> ExplainAtomCode(code,includeChirality=True)
64 ('C', 1, 0, '')
65
66 Obviously switching the chirality changes the results:
67 >>> m = Chem.MolFromSmiles('C[C@@H](F)Cl')
68 >>> code = GetAtomCode(m.GetAtomWithIdx(1),includeChirality=True)
69 >>> ExplainAtomCode(code,includeChirality=True)
70 ('C', 3, 0, 'S')
71
72 """
73 typeMask = (1 << rdMolDescriptors.AtomPairsParameters.numTypeBits) - 1
74 branchMask = (1 << rdMolDescriptors.AtomPairsParameters.numBranchBits) - 1
75 piMask = (1 << rdMolDescriptors.AtomPairsParameters.numPiBits) - 1
76 chiMask = (1 << rdMolDescriptors.AtomPairsParameters.numChiralBits) - 1
77
78 nBranch = int(code & branchMask)
79 code = code >> rdMolDescriptors.AtomPairsParameters.numBranchBits
80
81 nPi = int(code & piMask)
82 code = code >> rdMolDescriptors.AtomPairsParameters.numPiBits
83
84 typeIdx = int(code & typeMask)
85 if typeIdx < len(rdMolDescriptors.AtomPairsParameters.atomTypes):
86 atomNum = rdMolDescriptors.AtomPairsParameters.atomTypes[typeIdx]
87 atomSymbol = Chem.GetPeriodicTable().GetElementSymbol(atomNum)
88 else:
89 atomSymbol = 'X'
90
91 if not includeChirality:
92 return (atomSymbol, nBranch, nPi)
93 else:
94 code = code >> rdMolDescriptors.AtomPairsParameters.numTypeBits
95 chiDict = {0:'',1:'R',2:'S'}
96 chiCode = int(code & chiMask)
97 return (atomSymbol, nBranch, nPi, chiDict[chiCode])
98
99 GetAtomCode = rdMolDescriptors.GetAtomPairAtomCode
100
102 """ Returns the number of electrons an atom is using for pi bonding
103
104 >>> m = Chem.MolFromSmiles('C=C')
105 >>> NumPiElectrons(m.GetAtomWithIdx(0))
106 1
107
108 >>> m = Chem.MolFromSmiles('C#CC')
109 >>> NumPiElectrons(m.GetAtomWithIdx(0))
110 2
111 >>> NumPiElectrons(m.GetAtomWithIdx(1))
112 2
113
114 >>> m = Chem.MolFromSmiles('O=C=CC')
115 >>> NumPiElectrons(m.GetAtomWithIdx(0))
116 1
117 >>> NumPiElectrons(m.GetAtomWithIdx(1))
118 2
119 >>> NumPiElectrons(m.GetAtomWithIdx(2))
120 1
121 >>> NumPiElectrons(m.GetAtomWithIdx(3))
122 0
123
124 >>> m = Chem.MolFromSmiles('c1ccccc1')
125 >>> NumPiElectrons(m.GetAtomWithIdx(0))
126 1
127
128 FIX: this behaves oddly in these cases:
129 >>> m = Chem.MolFromSmiles('S(=O)(=O)')
130 >>> NumPiElectrons(m.GetAtomWithIdx(0))
131 2
132
133 >>> m = Chem.MolFromSmiles('S(=O)(=O)(O)O')
134 >>> NumPiElectrons(m.GetAtomWithIdx(0))
135 0
136
137 In the second case, the S atom is tagged as sp3 hybridized.
138
139 """
140
141 res = 0
142 if atom.GetIsAromatic():
143 res = 1
144 elif atom.GetHybridization() != Chem.HybridizationType.SP3:
145
146
147 res = atom.GetExplicitValence() - atom.GetNumExplicitHs()
148 if res < atom.GetDegree():
149 raise ValueError("explicit valence exceeds atom degree")
150 res -= atom.GetDegree()
151 return res
152
153
155 """ Returns the number of bits in common between two vectors
156
157 **Arguments**:
158
159 - two vectors (sequences of bit ids)
160
161 **Returns**: an integer
162
163 **Notes**
164
165 - the vectors must be sorted
166
167 - duplicate bit IDs are counted more than once
168
169 >>> BitsInCommon( (1,2,3,4,10), (2,4,6) )
170 2
171
172 Here's how duplicates are handled:
173 >>> BitsInCommon( (1,2,2,3,4), (2,2,4,5,6) )
174 3
175
176 """
177 res = 0
178 v2Pos = 0
179 nV2 = len(v2)
180 for val in v1:
181 while v2Pos < nV2 and v2[v2Pos] < val:
182 v2Pos += 1
183 if v2Pos >= nV2:
184 break
185 if v2[v2Pos] == val:
186 res += 1
187 v2Pos += 1
188 return res
189
190
192 """ Implements the DICE similarity metric.
193 This is the recommended metric in both the Topological torsions
194 and Atom pairs papers.
195
196 **Arguments**:
197
198 - two vectors (sequences of bit ids)
199
200 **Returns**: a float.
201
202 **Notes**
203
204 - the vectors must be sorted
205
206
207 >>> DiceSimilarity( (1,2,3), (1,2,3) )
208 1.0
209 >>> DiceSimilarity( (1,2,3), (5,6) )
210 0.0
211 >>> DiceSimilarity( (1,2,3,4), (1,3,5,7) )
212 0.5
213 >>> DiceSimilarity( (1,2,3,4,5,6), (1,3) )
214 0.5
215
216 Note that duplicate bit IDs count multiple times:
217 >>> DiceSimilarity( (1,1,3,4,5,6), (1,1) )
218 0.5
219
220 but only if they are duplicated in both vectors:
221 >>> DiceSimilarity( (1,1,3,4,5,6), (1,) )==2./7
222 True
223
224 edge case
225 >>> DiceSimilarity( (), () )
226 0.0
227
228 and bounds check
229 >>> DiceSimilarity( (1,1,3,4), (1,1))
230 0.666...
231 >>> DiceSimilarity( (1,1,3,4), (1,1), bounds=0.3)
232 0.666...
233 >>> DiceSimilarity( (1,1,3,4), (1,1), bounds=0.33)
234 0.666...
235 >>> DiceSimilarity( (1,1,3,4,5,6), (1,1), bounds=0.34)
236 0.0
237
238 """
239 denom = 1.0 * (len(v1) + len(v2))
240 if not denom:
241 res = 0.0
242 else:
243 if bounds and (min(len(v1), len(v2)) / denom) < bounds:
244 numer = 0.0
245 else:
246 numer = 2.0 * BitsInCommon(v1, v2)
247 res = numer / denom
248 return res
249
250
252 """ Returns the Dot product between two vectors:
253
254 **Arguments**:
255
256 - two vectors (sequences of bit ids)
257
258 **Returns**: an integer
259
260 **Notes**
261
262 - the vectors must be sorted
263
264 - duplicate bit IDs are counted more than once
265
266 >>> Dot( (1,2,3,4,10), (2,4,6) )
267 2
268
269 Here's how duplicates are handled:
270 >>> Dot( (1,2,2,3,4), (2,2,4,5,6) )
271 5
272 >>> Dot( (1,2,2,3,4), (2,4,5,6) )
273 2
274 >>> Dot( (1,2,2,3,4), (5,6) )
275 0
276 >>> Dot( (), (5,6) )
277 0
278
279 """
280 res = 0
281 nV1 = len(v1)
282 nV2 = len(v2)
283 i = 0
284 j = 0
285 while i < nV1:
286 v1Val = v1[i]
287 v1Count = 1
288 i += 1
289 while i < nV1 and v1[i] == v1Val:
290 v1Count += 1
291 i += 1
292 while j < nV2 and v2[j] < v1Val:
293 j += 1
294 if j < nV2 and v2[j] == v1Val:
295 v2Count = 1
296 j += 1
297 while j < nV2 and v2[j] == v1Val:
298 v2Count += 1
299 j += 1
300 commonCount = min(v1Count, v2Count)
301 res += commonCount * commonCount
302 elif j >= nV2:
303 break
304 return res
305
306
308 """ Implements the Cosine similarity metric.
309 This is the recommended metric in the LaSSI paper
310
311 **Arguments**:
312
313 - two vectors (sequences of bit ids)
314
315 **Returns**: a float.
316
317 **Notes**
318
319 - the vectors must be sorted
320
321 >>> print('%.3f'%CosineSimilarity( (1,2,3,4,10), (2,4,6) ))
322 0.516
323 >>> print('%.3f'%CosineSimilarity( (1,2,2,3,4), (2,2,4,5,6) ))
324 0.714
325 >>> print('%.3f'%CosineSimilarity( (1,2,2,3,4), (1,2,2,3,4) ))
326 1.000
327 >>> print('%.3f'%CosineSimilarity( (1,2,2,3,4), (5,6,7) ))
328 0.000
329 >>> print('%.3f'%CosineSimilarity( (1,2,2,3,4), () ))
330 0.000
331
332 """
333 d1 = Dot(v1, v1)
334 d2 = Dot(v2, v2)
335 denom = math.sqrt(d1 * d2)
336 if not denom:
337 res = 0.0
338 else:
339 numer = Dot(v1, v2)
340 res = numer / denom
341 return res
342
343
344
345
346
347
349 import sys
350 import doctest
351 failed, _ = doctest.testmod(optionflags=doctest.ELLIPSIS, verbose=verbose)
352 sys.exit(failed)
353
354
355 if __name__ == '__main__':
356 _runDoctests()
357