1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32 """
33 Fragmentation algorithm
34 -----------------------
35
36 identify acyclic bonds
37 enumerate all single cuts
38 make sure you chop off more that 1 atom
39 keeps bits which are >60% query mol
40 enumerate all double cuts
41 keeps bits with 1 attachment point (i.e throw middle bit away)
42 need to be >60% query mol
43
44 identify exocyclic bonds
45 enumerate all single "ring" cuts
46 Check if it results in more that one component
47 keep correct bit if >40% query mol
48
49 enumerate successful "rings" cuts with an acyclic cut
50 Check if it results in more that one component
51 keep correct if >60% query mol
52
53 """
54 from itertools import combinations
55 import sys
56
57 from rdkit import Chem, DataStructs
58 from rdkit.Chem import rdqueries
59
60
61
62 rdkitFpParams = {'maxPath': 5, 'fpSize': 1024, 'nBitsPerHash': 2}
63
64
65 FTYPE_ACYCLIC = 'acyclic'
66 FTYPE_CYCLIC = 'cyclic'
67 FTYPE_CYCLIC_ACYCLIC = 'cyclic_and_acyclic'
68
69
70
71
72 ACYC_SMARTS = Chem.MolFromSmarts("[*]!@!=!#[*]")
73
74 CYC_SMARTS = Chem.MolFromSmarts("[R1,R2]@[r;!R1]")
75
76
77
78
79
80 cSma1 = Chem.MolFromSmarts("[#0][r].[r][#0]")
81 cSma2 = Chem.MolFromSmarts("[#0][r][#0]")
82 dummyAtomQuery = rdqueries.AtomNumEqualsQueryAtom(0)
83
84
86 """ Fragment molecule on bonds and reduce to fraggle fragmentation SMILES.
87 If none exists, returns None """
88
89
90 bondIdx = [mol.GetBondBetweenAtoms(*bond).GetIdx() for bond in bonds]
91 modifiedMol = Chem.FragmentOnBonds(mol, bondIdx, dummyLabels=[(0, 0)] * len(bondIdx))
92
93
94
95 Chem.SanitizeMol(modifiedMol, Chem.SanitizeFlags.SANITIZE_PROPERTIES |
96 Chem.SanitizeFlags.SANITIZE_SYMMRINGS)
97
98 fragments = Chem.GetMolFrags(modifiedMol, asMols=True, sanitizeFrags=False)
99 return select_fragments(fragments, ftype, hac)
100
101
103 if ftype == FTYPE_ACYCLIC:
104 result = []
105 result_hcount = 0
106 for fMol in fragments:
107 nAttachments = len(fMol.GetAtomsMatchingQuery(dummyAtomQuery))
108
109 if nAttachments == 1:
110 fhac = fMol.GetNumAtoms()
111
112
113
114
115
116 if fhac > 3:
117 result.append(Chem.MolToSmiles(fMol))
118 result_hcount += fhac
119
120
121 if result and (result_hcount > 0.6 * hac):
122 result = '.'.join(result)
123 else:
124 result = None
125 return result
126
127 elif ftype == FTYPE_CYCLIC:
128
129 if len(fragments) != 2:
130 return None
131 result = None
132 for fMol in fragments:
133 f = Chem.MolToSmiles(fMol)
134
135
136 if isValidRingCut(fMol):
137 result_hcount = fMol.GetNumAtoms()
138 if (result_hcount > 3) and (result_hcount > 0.4 * hac):
139 result = f
140 return result
141
142 elif (ftype == FTYPE_CYCLIC_ACYCLIC):
143
144
145 result = []
146 result_hcount = 0
147 for fMol in fragments:
148 nAttachments = len(fMol.GetAtomsMatchingQuery(dummyAtomQuery))
149
150 if nAttachments >= 3:
151 continue
152 fhac = fMol.GetNumAtoms()
153 if fhac <= 3:
154 continue
155
156 if nAttachments == 2:
157
158 if isValidRingCut(fMol):
159 result.append(Chem.MolToSmiles(fMol))
160 result_hcount += fhac
161 elif nAttachments == 1:
162 result.append(Chem.MolToSmiles(fMol))
163 result_hcount += fhac
164
165
166
167 if len(result) == 2 and result_hcount > 0.6 * hac:
168 result = '.'.join(result)
169 else:
170 result = None
171 return result
172
173 else:
174 raise NotImplementedError('Invalid fragmentation type {0}'.format(ftype))
175
176
178 """ to check is a fragment is a valid ring cut, it needs to match the
179 SMARTS: [$([#0][r].[r][#0]),$([#0][r][#0])] """
180
181 Chem.SanitizeMol(mol, Chem.SanitizeFlags.SANITIZE_SYMMRINGS)
182 return mol.HasSubstructMatch(cSma1) or mol.HasSubstructMatch(cSma2)
183
184
186 """ Create all possible fragmentations for molecule
187 >>> q = Chem.MolFromSmiles('COc1cc(CN2CCC(NC(=O)c3cncc(C)c3)CC2)c(OC)c2ccccc12')
188 >>> fragments = generate_fraggle_fragmentation(q)
189 >>> fragments = sorted(['.'.join(sorted(s.split('.'))) for s in fragments])
190 >>> fragments
191 ['[*]C(=O)NC1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1',
192 '[*]C(=O)c1cncc(C)c1.[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1',
193 '[*]C(=O)c1cncc(C)c1.[*]Cc1cc(OC)c2ccccc2c1OC',
194 '[*]C(=O)c1cncc(C)c1.[*]c1cc(OC)c2ccccc2c1OC',
195 '[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1',
196 '[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1.[*]c1cncc(C)c1',
197 '[*]Cc1cc(OC)c2ccccc2c1OC.[*]NC(=O)c1cncc(C)c1',
198 '[*]Cc1cc(OC)c2ccccc2c1OC.[*]c1cncc(C)c1',
199 '[*]N1CCC(NC(=O)c2cncc(C)c2)CC1.[*]c1cc(OC)c2ccccc2c1OC',
200 '[*]NC(=O)c1cncc(C)c1.[*]c1cc(OC)c2ccccc2c1OC',
201 '[*]NC1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1',
202 '[*]NC1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1.[*]c1cncc(C)c1',
203 '[*]c1c(CN2CCC(NC(=O)c3cncc(C)c3)CC2)cc(OC)c2ccccc12',
204 '[*]c1c(OC)cc(CN2CCC(NC(=O)c3cncc(C)c3)CC2)c(OC)c1[*]',
205 '[*]c1cc(CN2CCC(NC(=O)c3cncc(C)c3)CC2)c(OC)c2ccccc12',
206 '[*]c1cc(OC)c2ccccc2c1OC.[*]c1cncc(C)c1']
207 """
208
209 hac = mol.GetNumAtoms()
210
211
212 acyclic_matching_atoms = mol.GetSubstructMatches(ACYC_SMARTS)
213 cyclic_matching_atoms = mol.GetSubstructMatches(CYC_SMARTS)
214 if verbose:
215 print("Matching Atoms:")
216 print("acyclic matching atoms: ", acyclic_matching_atoms)
217 print("cyclic matching atoms: ", cyclic_matching_atoms)
218
219
220
221 out_fragments = set()
222
223
224
225
226
227
228
229 for bond1, bond2 in combinations(acyclic_matching_atoms, 2):
230 fragment = delete_bonds(mol, [bond1, bond2], FTYPE_ACYCLIC, hac)
231 if fragment is not None:
232 out_fragments.add(fragment)
233
234
235
236
237 for bond1, bond2 in combinations(cyclic_matching_atoms, 2):
238 fragment = delete_bonds(mol, [bond1, bond2], FTYPE_CYCLIC, hac)
239 if fragment is None:
240 continue
241 out_fragments.add(fragment)
242
243 for abond in acyclic_matching_atoms:
244 fragment = delete_bonds(mol, [bond1, bond2, abond], FTYPE_CYCLIC_ACYCLIC, hac)
245 if fragment is not None:
246 out_fragments.add(fragment)
247
248 return sorted(out_fragments)
249
250
252 """ atomContrib algorithm
253 generate fp of query_substructs (qfp)
254
255 loop through atoms of smiles
256 For each atom
257 Generate partial fp of the atom (pfp)
258 Find Tversky sim of pfp in qfp
259 If Tversky < 0.8, mark atom in smiles
260
261 Loop through marked atoms
262 If marked atom in ring - turn all atoms in that ring to * (aromatic) or Sc (aliphatic)
263 For each marked atom
264 If aromatic turn to a *
265 If aliphatic turn to a Sc
266
267 Return modified smiles
268 """
269
270 def partialSimilarity(atomID):
271 """ Determine similarity for the atoms set by atomID """
272
273 modifiedFP = DataStructs.ExplicitBitVect(1024)
274 modifiedFP.SetBitsFromList(aBits[atomID])
275 return DataStructs.TverskySimilarity(subsFp, modifiedFP, 0, 1)
276
277
278 pMol = Chem.Mol(mol)
279 aBits = []
280 _ = Chem.RDKFingerprint(pMol, atomBits=aBits, **rdkitFpParams)
281
282
283 qsMol = Chem.MolFromSmiles(subs)
284 subsFp = Chem.RDKFingerprint(qsMol, **rdkitFpParams)
285
286
287 marked = set()
288 for atom in pMol.GetAtoms():
289 if partialSimilarity(atom.GetIdx()) < tverskyThresh:
290 marked.add(atom.GetIdx())
291
292
293
294
295 markRingAtoms = set()
296 for ring in pMol.GetRingInfo().AtomRings():
297 if any(ringAtom in marked for ringAtom in ring):
298 markRingAtoms.update(ring)
299 marked.update(markRingAtoms)
300
301 if marked:
302
303 for idx in marked:
304 if pMol.GetAtomWithIdx(idx).GetIsAromatic():
305 pMol.GetAtomWithIdx(idx).SetAtomicNum(0)
306 pMol.GetAtomWithIdx(idx).SetNoImplicit(True)
307 else:
308
309 pMol.GetAtomWithIdx(idx).SetAtomicNum(21)
310
311
312
313 try:
314 Chem.SanitizeMol(pMol, sanitizeOps=Chem.SANITIZE_ALL ^ Chem.SANITIZE_KEKULIZE ^
315 Chem.SANITIZE_SETAROMATICITY)
316 except Exception:
317 sys.stderr.write("Can't parse smiles: %s\n" % (Chem.MolToSmiles(pMol)))
318 pMol = Chem.Mol(mol)
319
320 return pMol
321
322
323 modified_query_fps = {}
324
325
327 qFP = Chem.RDKFingerprint(qMol, **rdkitFpParams)
328 iFP = Chem.RDKFingerprint(inMol, **rdkitFpParams)
329
330 rdkit_sim = DataStructs.TanimotoSimilarity(qFP, iFP)
331
332 qm_key = "%s_%s" % (qSubs, qSmi)
333 if qm_key in modified_query_fps:
334 qmMolFp = modified_query_fps[qm_key]
335 else:
336 qmMol = atomContrib(qSubs, qMol, tverskyThresh)
337 qmMolFp = Chem.RDKFingerprint(qmMol, **rdkitFpParams)
338 modified_query_fps[qm_key] = qmMolFp
339
340 rmMol = atomContrib(qSubs, inMol, tverskyThresh)
341
342
343 try:
344 rmMolFp = Chem.RDKFingerprint(rmMol, **rdkitFpParams)
345 fraggle_sim = max(DataStructs.FingerprintSimilarity(qmMolFp, rmMolFp), rdkit_sim)
346 except Exception:
347 sys.stderr.write("Can't generate fp for: %s\n" % (Chem.MolToSmiles(rmMol, True)))
348 fraggle_sim = 0.0
349
350 return rdkit_sim, fraggle_sim
351
352
354 """ return the Fraggle similarity between two molecules
355
356 >>> q = Chem.MolFromSmiles('COc1cc(CN2CCC(NC(=O)c3cncc(C)c3)CC2)c(OC)c2ccccc12')
357 >>> m = Chem.MolFromSmiles('COc1cc(CN2CCC(NC(=O)c3ccccc3)CC2)c(OC)c2ccccc12')
358 >>> sim,match = GetFraggleSimilarity(q,m)
359 >>> sim
360 0.980...
361 >>> match
362 '[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1'
363
364 >>> m = Chem.MolFromSmiles('COc1cc(CN2CCC(Nc3nc4ccccc4s3)CC2)c(OC)c2ccccc12')
365 >>> sim,match = GetFraggleSimilarity(q,m)
366 >>> sim
367 0.794...
368 >>> match
369 '[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1'
370
371 >>> q = Chem.MolFromSmiles('COc1ccccc1')
372 >>> sim,match = GetFraggleSimilarity(q,m)
373 >>> sim
374 0.347...
375 >>> match
376 '[*]c1ccccc1'
377
378 """
379 if hasattr(queryMol, '_fraggleDecomp'):
380 frags = queryMol._fraggleDecomp
381 else:
382 frags = generate_fraggle_fragmentation(queryMol)
383 queryMol._fraggleDecomp = frags
384 qSmi = Chem.MolToSmiles(queryMol, True)
385 result = 0.0
386 bestMatch = None
387 for frag in frags:
388 _, fragsim = compute_fraggle_similarity_for_subs(refMol, queryMol, qSmi, frag, tverskyThresh)
389 if fragsim > result:
390 result = fragsim
391 bestMatch = frag
392 return result, bestMatch
393
394
395
396
397
398
400 import doctest
401 failed, _ = doctest.testmod(optionflags=doctest.ELLIPSIS + doctest.NORMALIZE_WHITESPACE,
402 verbose=verbose)
403 sys.exit(failed)
404
405
406 if __name__ == '__main__':
407 _runDoctests()
408