Package rdkit :: Package Chem :: Package Fraggle :: Module FraggleSim
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.Fraggle.FraggleSim

  1  # Copyright (c) 2013, GlaxoSmithKline Research & Development Ltd. 
  2  # All rights reserved. 
  3  # 
  4  # Redistribution and use in source and binary forms, with or without 
  5  # modification, are permitted provided that the following conditions are 
  6  # met: 
  7  # 
  8  #     * Redistributions of source code must retain the above copyright 
  9  #       notice, this list of conditions and the following disclaimer. 
 10  #     * Redistributions in binary form must reproduce the above 
 11  #       copyright notice, this list of conditions and the following 
 12  #       disclaimer in the documentation and/or other materials provided 
 13  #       with the distribution. 
 14  #     * Neither the name of GlaxoSmithKline Research & Development Ltd. 
 15  #       nor the names of its contributors may be used to endorse or promote 
 16  #       products derived from this software without specific prior written 
 17  #       permission. 
 18  # 
 19  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 20  # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 21  # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
 22  # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
 23  # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
 24  # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
 25  # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
 26  # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
 27  # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
 28  # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
 29  # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 30  # 
 31  # Created by Jameed Hussain, May 2013 
 32  """ 
 33  Fragmentation algorithm 
 34  ----------------------- 
 35   
 36  identify acyclic bonds 
 37  enumerate all single cuts 
 38  make sure you chop off more that 1 atom 
 39  keeps bits which are >60% query mol 
 40  enumerate all double cuts 
 41  keeps bits with 1 attachment point (i.e throw middle bit away) 
 42  need to be >60% query mol 
 43   
 44  identify exocyclic bonds 
 45  enumerate all single "ring" cuts 
 46  Check if it results in more that one component 
 47  keep correct bit if >40% query mol 
 48   
 49  enumerate successful "rings" cuts with an acyclic cut 
 50  Check if it results in more that one component 
 51  keep correct if >60% query mol 
 52   
 53  """ 
 54  from itertools import combinations 
 55  import sys 
 56   
 57  from rdkit import Chem, DataStructs 
 58  from rdkit.Chem import rdqueries 
 59   
 60   
 61  # our default rdkit fingerprinter parameters: 
 62  rdkitFpParams = {'maxPath': 5, 'fpSize': 1024, 'nBitsPerHash': 2} 
 63   
 64  # Considered fragment types 
 65  FTYPE_ACYCLIC = 'acyclic' 
 66  FTYPE_CYCLIC = 'cyclic' 
 67  FTYPE_CYCLIC_ACYCLIC = 'cyclic_and_acyclic' 
 68   
 69  # Global SMARTS used by the program 
 70   
 71  # acyclic bond smarts 
 72  ACYC_SMARTS = Chem.MolFromSmarts("[*]!@!=!#[*]") 
 73  # exocyclic/fused exocyclic bond smarts 
 74  CYC_SMARTS = Chem.MolFromSmarts("[R1,R2]@[r;!R1]") 
 75   
 76  # smarts used to find appropriate fragment for 
 77  # would use SMARTS: [$([#0][r].[r][#0]),$([#0][r][#0])] 
 78  # but RDkit doesn't support component SMARTS in recursive one - $([#0][r].[r][#0]) 
 79  # hence split into two 
 80  cSma1 = Chem.MolFromSmarts("[#0][r].[r][#0]") 
 81  cSma2 = Chem.MolFromSmarts("[#0][r][#0]") 
 82  dummyAtomQuery = rdqueries.AtomNumEqualsQueryAtom(0) 
 83   
 84   
85 -def delete_bonds(mol, bonds, ftype, hac):
86 """ Fragment molecule on bonds and reduce to fraggle fragmentation SMILES. 87 If none exists, returns None """ 88 89 # Replace the given bonds with attachment points (B1-B2 -> B1-[*].[*]-B2) 90 bondIdx = [mol.GetBondBetweenAtoms(*bond).GetIdx() for bond in bonds] 91 modifiedMol = Chem.FragmentOnBonds(mol, bondIdx, dummyLabels=[(0, 0)] * len(bondIdx)) 92 93 # should be able to get away without sanitising mol as the valencies should be okay 94 # do not do a full sanitization, but do find rings and calculate valences: 95 Chem.SanitizeMol(modifiedMol, Chem.SanitizeFlags.SANITIZE_PROPERTIES | 96 Chem.SanitizeFlags.SANITIZE_SYMMRINGS) 97 98 fragments = Chem.GetMolFrags(modifiedMol, asMols=True, sanitizeFrags=False) 99 return select_fragments(fragments, ftype, hac)
100 101
102 -def select_fragments(fragments, ftype, hac):
103 if ftype == FTYPE_ACYCLIC: 104 result = [] 105 result_hcount = 0 106 for fMol in fragments: 107 nAttachments = len(fMol.GetAtomsMatchingQuery(dummyAtomQuery)) 108 # check if terminal fragment 109 if nAttachments == 1: 110 fhac = fMol.GetNumAtoms() 111 112 # if the fragment is 2 atoms (or less - includes attachment) it is too small 113 # to be interesting. This check has the additional benefit 114 # of pulling out the relevant single cuts as it discards 115 # fragments where we only chop off a small part of the input cmpd 116 if fhac > 3: 117 result.append(Chem.MolToSmiles(fMol)) 118 result_hcount += fhac 119 120 # needs to be greater than 60% of parent mol 121 if result and (result_hcount > 0.6 * hac): 122 result = '.'.join(result) 123 else: 124 result = None 125 return result 126 127 elif ftype == FTYPE_CYCLIC: 128 # make sure it is 2 components 129 if len(fragments) != 2: 130 return None 131 result = None 132 for fMol in fragments: 133 f = Chem.MolToSmiles(fMol) 134 # check if a valid cut 135 # needs to be greater 3 heavy atoms and greater than 40% of parent mol 136 if isValidRingCut(fMol): 137 result_hcount = fMol.GetNumAtoms() 138 if (result_hcount > 3) and (result_hcount > 0.4 * hac): 139 result = f 140 return result 141 142 elif (ftype == FTYPE_CYCLIC_ACYCLIC): 143 # need to find the fragments which are valid which means they must be: 144 # Terminal (one attachment point) or valid ring cut 145 result = [] 146 result_hcount = 0 147 for fMol in fragments: 148 nAttachments = len(fMol.GetAtomsMatchingQuery(dummyAtomQuery)) 149 # We need to have a fragment that has 1 or 2 attachment points and that has more than 3 atoms 150 if nAttachments >= 3: 151 continue 152 fhac = fMol.GetNumAtoms() 153 if fhac <= 3: 154 continue 155 156 if nAttachments == 2: 157 # check if a valid cut 158 if isValidRingCut(fMol): 159 result.append(Chem.MolToSmiles(fMol)) 160 result_hcount += fhac 161 elif nAttachments == 1: 162 result.append(Chem.MolToSmiles(fMol)) 163 result_hcount += fhac 164 165 # appropriate fragmentation must have 2 components and needs to be greater than 60% of 166 # parent mol 167 if len(result) == 2 and result_hcount > 0.6 * hac: 168 result = '.'.join(result) 169 else: 170 result = None 171 return result 172 173 else: 174 raise NotImplementedError('Invalid fragmentation type {0}'.format(ftype))
175 176
177 -def isValidRingCut(mol):
178 """ to check is a fragment is a valid ring cut, it needs to match the 179 SMARTS: [$([#0][r].[r][#0]),$([#0][r][#0])] """ 180 # At this point, the molecule requires the identification of rings, so we need to sanitize 181 Chem.SanitizeMol(mol, Chem.SanitizeFlags.SANITIZE_SYMMRINGS) 182 return mol.HasSubstructMatch(cSma1) or mol.HasSubstructMatch(cSma2)
183 184
185 -def generate_fraggle_fragmentation(mol, verbose=False):
186 """ Create all possible fragmentations for molecule 187 >>> q = Chem.MolFromSmiles('COc1cc(CN2CCC(NC(=O)c3cncc(C)c3)CC2)c(OC)c2ccccc12') 188 >>> fragments = generate_fraggle_fragmentation(q) 189 >>> fragments = sorted(['.'.join(sorted(s.split('.'))) for s in fragments]) 190 >>> fragments 191 ['[*]C(=O)NC1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1', 192 '[*]C(=O)c1cncc(C)c1.[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1', 193 '[*]C(=O)c1cncc(C)c1.[*]Cc1cc(OC)c2ccccc2c1OC', 194 '[*]C(=O)c1cncc(C)c1.[*]c1cc(OC)c2ccccc2c1OC', 195 '[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1', 196 '[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1.[*]c1cncc(C)c1', 197 '[*]Cc1cc(OC)c2ccccc2c1OC.[*]NC(=O)c1cncc(C)c1', 198 '[*]Cc1cc(OC)c2ccccc2c1OC.[*]c1cncc(C)c1', 199 '[*]N1CCC(NC(=O)c2cncc(C)c2)CC1.[*]c1cc(OC)c2ccccc2c1OC', 200 '[*]NC(=O)c1cncc(C)c1.[*]c1cc(OC)c2ccccc2c1OC', 201 '[*]NC1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1', 202 '[*]NC1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1.[*]c1cncc(C)c1', 203 '[*]c1c(CN2CCC(NC(=O)c3cncc(C)c3)CC2)cc(OC)c2ccccc12', 204 '[*]c1c(OC)cc(CN2CCC(NC(=O)c3cncc(C)c3)CC2)c(OC)c1[*]', 205 '[*]c1cc(CN2CCC(NC(=O)c3cncc(C)c3)CC2)c(OC)c2ccccc12', 206 '[*]c1cc(OC)c2ccccc2c1OC.[*]c1cncc(C)c1'] 207 """ 208 # query mol heavy atom count 209 hac = mol.GetNumAtoms() 210 211 # find the relevant bonds to break 212 acyclic_matching_atoms = mol.GetSubstructMatches(ACYC_SMARTS) 213 cyclic_matching_atoms = mol.GetSubstructMatches(CYC_SMARTS) 214 if verbose: 215 print("Matching Atoms:") 216 print("acyclic matching atoms: ", acyclic_matching_atoms) 217 print("cyclic matching atoms: ", cyclic_matching_atoms) 218 219 # different cuts can give the same fragments 220 # to use out_fragments to remove them 221 out_fragments = set() 222 223 ###################### 224 # Single acyclic Cuts 225 ###################### 226 # loop to generate every single and double cut in the molecule 227 # single cuts are not required as relevant single cut fragments can be found 228 # from the double cuts. For explanation see check_fragments method 229 for bond1, bond2 in combinations(acyclic_matching_atoms, 2): 230 fragment = delete_bonds(mol, [bond1, bond2], FTYPE_ACYCLIC, hac) 231 if fragment is not None: 232 out_fragments.add(fragment) 233 234 ################################## 235 # Fused/Spiro exocyclic bond Cuts 236 ################################## 237 for bond1, bond2 in combinations(cyclic_matching_atoms, 2): 238 fragment = delete_bonds(mol, [bond1, bond2], FTYPE_CYCLIC, hac) 239 if fragment is None: 240 continue 241 out_fragments.add(fragment) 242 # now do an acyclic cut with the successful cyclic cut 243 for abond in acyclic_matching_atoms: 244 fragment = delete_bonds(mol, [bond1, bond2, abond], FTYPE_CYCLIC_ACYCLIC, hac) 245 if fragment is not None: 246 out_fragments.add(fragment) 247 248 return sorted(out_fragments)
249 250
251 -def atomContrib(subs, mol, tverskyThresh=0.8):
252 """ atomContrib algorithm 253 generate fp of query_substructs (qfp) 254 255 loop through atoms of smiles 256 For each atom 257 Generate partial fp of the atom (pfp) 258 Find Tversky sim of pfp in qfp 259 If Tversky < 0.8, mark atom in smiles 260 261 Loop through marked atoms 262 If marked atom in ring - turn all atoms in that ring to * (aromatic) or Sc (aliphatic) 263 For each marked atom 264 If aromatic turn to a * 265 If aliphatic turn to a Sc 266 267 Return modified smiles 268 """ 269 270 def partialSimilarity(atomID): 271 """ Determine similarity for the atoms set by atomID """ 272 # create empty fp 273 modifiedFP = DataStructs.ExplicitBitVect(1024) 274 modifiedFP.SetBitsFromList(aBits[atomID]) 275 return DataStructs.TverskySimilarity(subsFp, modifiedFP, 0, 1)
276 277 # generate mol object & fp for input mol (we are interested in the bits each atom sets) 278 pMol = Chem.Mol(mol) 279 aBits = [] 280 _ = Chem.RDKFingerprint(pMol, atomBits=aBits, **rdkitFpParams) 281 282 # generate fp of query_substructs 283 qsMol = Chem.MolFromSmiles(subs) 284 subsFp = Chem.RDKFingerprint(qsMol, **rdkitFpParams) 285 286 # loop through atoms of smiles get atoms that have a high similarity with substructure 287 marked = set() 288 for atom in pMol.GetAtoms(): 289 if partialSimilarity(atom.GetIdx()) < tverskyThresh: 290 marked.add(atom.GetIdx()) 291 292 # get rings to change 293 294 # If a marked atom is within a ring, mark the whole ring 295 markRingAtoms = set() 296 for ring in pMol.GetRingInfo().AtomRings(): 297 if any(ringAtom in marked for ringAtom in ring): 298 markRingAtoms.update(ring) 299 marked.update(markRingAtoms) 300 301 if marked: 302 # now mutate the marked atoms 303 for idx in marked: 304 if pMol.GetAtomWithIdx(idx).GetIsAromatic(): 305 pMol.GetAtomWithIdx(idx).SetAtomicNum(0) 306 pMol.GetAtomWithIdx(idx).SetNoImplicit(True) 307 else: 308 # gives best sim 309 pMol.GetAtomWithIdx(idx).SetAtomicNum(21) 310 # works better but when replace S it fails due to valency 311 # pMol.GetAtomWithIdx(idx).SetAtomicNum(6) 312 313 try: 314 Chem.SanitizeMol(pMol, sanitizeOps=Chem.SANITIZE_ALL ^ Chem.SANITIZE_KEKULIZE ^ 315 Chem.SANITIZE_SETAROMATICITY) 316 except Exception: 317 sys.stderr.write("Can't parse smiles: %s\n" % (Chem.MolToSmiles(pMol))) 318 pMol = Chem.Mol(mol) 319 320 return pMol 321 322 323 modified_query_fps = {} 324 325
326 -def compute_fraggle_similarity_for_subs(inMol, qMol, qSmi, qSubs, tverskyThresh=0.8):
327 qFP = Chem.RDKFingerprint(qMol, **rdkitFpParams) 328 iFP = Chem.RDKFingerprint(inMol, **rdkitFpParams) 329 330 rdkit_sim = DataStructs.TanimotoSimilarity(qFP, iFP) 331 332 qm_key = "%s_%s" % (qSubs, qSmi) 333 if qm_key in modified_query_fps: 334 qmMolFp = modified_query_fps[qm_key] 335 else: 336 qmMol = atomContrib(qSubs, qMol, tverskyThresh) 337 qmMolFp = Chem.RDKFingerprint(qmMol, **rdkitFpParams) 338 modified_query_fps[qm_key] = qmMolFp 339 340 rmMol = atomContrib(qSubs, inMol, tverskyThresh) 341 342 # wrap in a try, catch 343 try: 344 rmMolFp = Chem.RDKFingerprint(rmMol, **rdkitFpParams) 345 fraggle_sim = max(DataStructs.FingerprintSimilarity(qmMolFp, rmMolFp), rdkit_sim) 346 except Exception: 347 sys.stderr.write("Can't generate fp for: %s\n" % (Chem.MolToSmiles(rmMol, True))) 348 fraggle_sim = 0.0 349 350 return rdkit_sim, fraggle_sim
351 352
353 -def GetFraggleSimilarity(queryMol, refMol, tverskyThresh=0.8):
354 """ return the Fraggle similarity between two molecules 355 356 >>> q = Chem.MolFromSmiles('COc1cc(CN2CCC(NC(=O)c3cncc(C)c3)CC2)c(OC)c2ccccc12') 357 >>> m = Chem.MolFromSmiles('COc1cc(CN2CCC(NC(=O)c3ccccc3)CC2)c(OC)c2ccccc12') 358 >>> sim,match = GetFraggleSimilarity(q,m) 359 >>> sim 360 0.980... 361 >>> match 362 '[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1' 363 364 >>> m = Chem.MolFromSmiles('COc1cc(CN2CCC(Nc3nc4ccccc4s3)CC2)c(OC)c2ccccc12') 365 >>> sim,match = GetFraggleSimilarity(q,m) 366 >>> sim 367 0.794... 368 >>> match 369 '[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1' 370 371 >>> q = Chem.MolFromSmiles('COc1ccccc1') 372 >>> sim,match = GetFraggleSimilarity(q,m) 373 >>> sim 374 0.347... 375 >>> match 376 '[*]c1ccccc1' 377 378 """ 379 if hasattr(queryMol, '_fraggleDecomp'): 380 frags = queryMol._fraggleDecomp 381 else: 382 frags = generate_fraggle_fragmentation(queryMol) 383 queryMol._fraggleDecomp = frags 384 qSmi = Chem.MolToSmiles(queryMol, True) 385 result = 0.0 386 bestMatch = None 387 for frag in frags: 388 _, fragsim = compute_fraggle_similarity_for_subs(refMol, queryMol, qSmi, frag, tverskyThresh) 389 if fragsim > result: 390 result = fragsim 391 bestMatch = frag 392 return result, bestMatch
393 394 395 # ------------------------------------ 396 # 397 # doctest boilerplate 398 #
399 -def _runDoctests(verbose=None): # pragma: nocover
400 import doctest 401 failed, _ = doctest.testmod(optionflags=doctest.ELLIPSIS + doctest.NORMALIZE_WHITESPACE, 402 verbose=verbose) 403 sys.exit(failed) 404 405 406 if __name__ == '__main__': # pragma: nocover 407 _runDoctests() 408