Package rdkit :: Package Chem :: Package AtomPairs :: Module Utils
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.AtomPairs.Utils

  1  # 
  2  #  Copyright (C) 2004-2017 Rational Discovery LLC 
  3  # 
  4  #   @@ All Rights Reserved @@ 
  5  #  This file is part of the RDKit. 
  6  #  The contents are covered by the terms of the BSD license 
  7  #  which is included in the file license.txt, found at the root 
  8  #  of the RDKit source tree. 
  9  # 
 10  from __future__ import print_function 
 11  from rdkit import Chem 
 12  from rdkit.Chem import rdMolDescriptors 
 13  import math 
 14   
 15   
16 -def ExplainAtomCode(code, branchSubtract=0, includeChirality=False):
17 """ 18 19 **Arguments**: 20 21 - the code to be considered 22 23 - branchSubtract: (optional) the constant that was subtracted off 24 the number of neighbors before integrating it into the code. 25 This is used by the topological torsions code. 26 27 - includeChirality: (optional) Determines whether or not chirality 28 was included when generating the atom code. 29 30 >>> m = Chem.MolFromSmiles('C=CC(=O)O') 31 >>> code = GetAtomCode(m.GetAtomWithIdx(0)) 32 >>> ExplainAtomCode(code) 33 ('C', 1, 1) 34 >>> code = GetAtomCode(m.GetAtomWithIdx(1)) 35 >>> ExplainAtomCode(code) 36 ('C', 2, 1) 37 >>> code = GetAtomCode(m.GetAtomWithIdx(2)) 38 >>> ExplainAtomCode(code) 39 ('C', 3, 1) 40 >>> code = GetAtomCode(m.GetAtomWithIdx(3)) 41 >>> ExplainAtomCode(code) 42 ('O', 1, 1) 43 >>> code = GetAtomCode(m.GetAtomWithIdx(4)) 44 >>> ExplainAtomCode(code) 45 ('O', 1, 0) 46 47 we can do chirality too, that returns an extra element in the tuple: 48 >>> m = Chem.MolFromSmiles('C[C@H](F)Cl') 49 >>> code = GetAtomCode(m.GetAtomWithIdx(1)) 50 >>> ExplainAtomCode(code) 51 ('C', 3, 0) 52 >>> code = GetAtomCode(m.GetAtomWithIdx(1),includeChirality=True) 53 >>> ExplainAtomCode(code,includeChirality=True) 54 ('C', 3, 0, 'R') 55 56 note that if we don't ask for chirality, we get the right answer even if 57 the atom code was calculated with chirality: 58 >>> ExplainAtomCode(code) 59 ('C', 3, 0) 60 61 non-chiral atoms return '' in the 4th field: 62 >>> code = GetAtomCode(m.GetAtomWithIdx(0),includeChirality=True) 63 >>> ExplainAtomCode(code,includeChirality=True) 64 ('C', 1, 0, '') 65 66 Obviously switching the chirality changes the results: 67 >>> m = Chem.MolFromSmiles('C[C@@H](F)Cl') 68 >>> code = GetAtomCode(m.GetAtomWithIdx(1),includeChirality=True) 69 >>> ExplainAtomCode(code,includeChirality=True) 70 ('C', 3, 0, 'S') 71 72 """ 73 typeMask = (1 << rdMolDescriptors.AtomPairsParameters.numTypeBits) - 1 74 branchMask = (1 << rdMolDescriptors.AtomPairsParameters.numBranchBits) - 1 75 piMask = (1 << rdMolDescriptors.AtomPairsParameters.numPiBits) - 1 76 chiMask = (1 << rdMolDescriptors.AtomPairsParameters.numChiralBits) - 1 77 78 nBranch = int(code & branchMask) 79 code = code >> rdMolDescriptors.AtomPairsParameters.numBranchBits 80 81 nPi = int(code & piMask) 82 code = code >> rdMolDescriptors.AtomPairsParameters.numPiBits 83 84 typeIdx = int(code & typeMask) 85 if typeIdx < len(rdMolDescriptors.AtomPairsParameters.atomTypes): 86 atomNum = rdMolDescriptors.AtomPairsParameters.atomTypes[typeIdx] 87 atomSymbol = Chem.GetPeriodicTable().GetElementSymbol(atomNum) 88 else: 89 atomSymbol = 'X' 90 91 if not includeChirality: 92 return (atomSymbol, nBranch, nPi) 93 else: 94 code = code >> rdMolDescriptors.AtomPairsParameters.numTypeBits 95 chiDict = {0:'',1:'R',2:'S'} 96 chiCode = int(code & chiMask) 97 return (atomSymbol, nBranch, nPi, chiDict[chiCode])
98 99 GetAtomCode = rdMolDescriptors.GetAtomPairAtomCode 100
101 -def NumPiElectrons(atom):
102 """ Returns the number of electrons an atom is using for pi bonding 103 104 >>> m = Chem.MolFromSmiles('C=C') 105 >>> NumPiElectrons(m.GetAtomWithIdx(0)) 106 1 107 108 >>> m = Chem.MolFromSmiles('C#CC') 109 >>> NumPiElectrons(m.GetAtomWithIdx(0)) 110 2 111 >>> NumPiElectrons(m.GetAtomWithIdx(1)) 112 2 113 114 >>> m = Chem.MolFromSmiles('O=C=CC') 115 >>> NumPiElectrons(m.GetAtomWithIdx(0)) 116 1 117 >>> NumPiElectrons(m.GetAtomWithIdx(1)) 118 2 119 >>> NumPiElectrons(m.GetAtomWithIdx(2)) 120 1 121 >>> NumPiElectrons(m.GetAtomWithIdx(3)) 122 0 123 124 >>> m = Chem.MolFromSmiles('c1ccccc1') 125 >>> NumPiElectrons(m.GetAtomWithIdx(0)) 126 1 127 128 FIX: this behaves oddly in these cases: 129 >>> m = Chem.MolFromSmiles('S(=O)(=O)') 130 >>> NumPiElectrons(m.GetAtomWithIdx(0)) 131 2 132 133 >>> m = Chem.MolFromSmiles('S(=O)(=O)(O)O') 134 >>> NumPiElectrons(m.GetAtomWithIdx(0)) 135 0 136 137 In the second case, the S atom is tagged as sp3 hybridized. 138 139 """ 140 141 res = 0 142 if atom.GetIsAromatic(): 143 res = 1 144 elif atom.GetHybridization() != Chem.HybridizationType.SP3: 145 # the number of pi electrons is just the number of 146 # unsaturations (valence - degree): 147 res = atom.GetExplicitValence() - atom.GetNumExplicitHs() 148 if res < atom.GetDegree(): 149 raise ValueError("explicit valence exceeds atom degree") 150 res -= atom.GetDegree() 151 return res
152 153
154 -def BitsInCommon(v1, v2):
155 """ Returns the number of bits in common between two vectors 156 157 **Arguments**: 158 159 - two vectors (sequences of bit ids) 160 161 **Returns**: an integer 162 163 **Notes** 164 165 - the vectors must be sorted 166 167 - duplicate bit IDs are counted more than once 168 169 >>> BitsInCommon( (1,2,3,4,10), (2,4,6) ) 170 2 171 172 Here's how duplicates are handled: 173 >>> BitsInCommon( (1,2,2,3,4), (2,2,4,5,6) ) 174 3 175 176 """ 177 res = 0 178 v2Pos = 0 179 nV2 = len(v2) 180 for val in v1: 181 while v2Pos < nV2 and v2[v2Pos] < val: 182 v2Pos += 1 183 if v2Pos >= nV2: 184 break 185 if v2[v2Pos] == val: 186 res += 1 187 v2Pos += 1 188 return res
189 190
191 -def DiceSimilarity(v1, v2, bounds=None):
192 """ Implements the DICE similarity metric. 193 This is the recommended metric in both the Topological torsions 194 and Atom pairs papers. 195 196 **Arguments**: 197 198 - two vectors (sequences of bit ids) 199 200 **Returns**: a float. 201 202 **Notes** 203 204 - the vectors must be sorted 205 206 207 >>> DiceSimilarity( (1,2,3), (1,2,3) ) 208 1.0 209 >>> DiceSimilarity( (1,2,3), (5,6) ) 210 0.0 211 >>> DiceSimilarity( (1,2,3,4), (1,3,5,7) ) 212 0.5 213 >>> DiceSimilarity( (1,2,3,4,5,6), (1,3) ) 214 0.5 215 216 Note that duplicate bit IDs count multiple times: 217 >>> DiceSimilarity( (1,1,3,4,5,6), (1,1) ) 218 0.5 219 220 but only if they are duplicated in both vectors: 221 >>> DiceSimilarity( (1,1,3,4,5,6), (1,) )==2./7 222 True 223 224 edge case 225 >>> DiceSimilarity( (), () ) 226 0.0 227 228 and bounds check 229 >>> DiceSimilarity( (1,1,3,4), (1,1)) 230 0.666... 231 >>> DiceSimilarity( (1,1,3,4), (1,1), bounds=0.3) 232 0.666... 233 >>> DiceSimilarity( (1,1,3,4), (1,1), bounds=0.33) 234 0.666... 235 >>> DiceSimilarity( (1,1,3,4,5,6), (1,1), bounds=0.34) 236 0.0 237 238 """ 239 denom = 1.0 * (len(v1) + len(v2)) 240 if not denom: 241 res = 0.0 242 else: 243 if bounds and (min(len(v1), len(v2)) / denom) < bounds: 244 numer = 0.0 245 else: 246 numer = 2.0 * BitsInCommon(v1, v2) 247 res = numer / denom 248 return res
249 250
251 -def Dot(v1, v2):
252 """ Returns the Dot product between two vectors: 253 254 **Arguments**: 255 256 - two vectors (sequences of bit ids) 257 258 **Returns**: an integer 259 260 **Notes** 261 262 - the vectors must be sorted 263 264 - duplicate bit IDs are counted more than once 265 266 >>> Dot( (1,2,3,4,10), (2,4,6) ) 267 2 268 269 Here's how duplicates are handled: 270 >>> Dot( (1,2,2,3,4), (2,2,4,5,6) ) 271 5 272 >>> Dot( (1,2,2,3,4), (2,4,5,6) ) 273 2 274 >>> Dot( (1,2,2,3,4), (5,6) ) 275 0 276 >>> Dot( (), (5,6) ) 277 0 278 279 """ 280 res = 0 281 nV1 = len(v1) 282 nV2 = len(v2) 283 i = 0 284 j = 0 285 while i < nV1: 286 v1Val = v1[i] 287 v1Count = 1 288 i += 1 289 while i < nV1 and v1[i] == v1Val: 290 v1Count += 1 291 i += 1 292 while j < nV2 and v2[j] < v1Val: 293 j += 1 294 if j < nV2 and v2[j] == v1Val: 295 v2Count = 1 296 j += 1 297 while j < nV2 and v2[j] == v1Val: 298 v2Count += 1 299 j += 1 300 commonCount = min(v1Count, v2Count) 301 res += commonCount * commonCount 302 elif j >= nV2: 303 break 304 return res
305 306
307 -def CosineSimilarity(v1, v2):
308 """ Implements the Cosine similarity metric. 309 This is the recommended metric in the LaSSI paper 310 311 **Arguments**: 312 313 - two vectors (sequences of bit ids) 314 315 **Returns**: a float. 316 317 **Notes** 318 319 - the vectors must be sorted 320 321 >>> print('%.3f'%CosineSimilarity( (1,2,3,4,10), (2,4,6) )) 322 0.516 323 >>> print('%.3f'%CosineSimilarity( (1,2,2,3,4), (2,2,4,5,6) )) 324 0.714 325 >>> print('%.3f'%CosineSimilarity( (1,2,2,3,4), (1,2,2,3,4) )) 326 1.000 327 >>> print('%.3f'%CosineSimilarity( (1,2,2,3,4), (5,6,7) )) 328 0.000 329 >>> print('%.3f'%CosineSimilarity( (1,2,2,3,4), () )) 330 0.000 331 332 """ 333 d1 = Dot(v1, v1) 334 d2 = Dot(v2, v2) 335 denom = math.sqrt(d1 * d2) 336 if not denom: 337 res = 0.0 338 else: 339 numer = Dot(v1, v2) 340 res = numer / denom 341 return res
342 343 344 # ------------------------------------ 345 # 346 # doctest boilerplate 347 #
348 -def _runDoctests(verbose=None): # pragma: nocover
349 import sys 350 import doctest 351 failed, _ = doctest.testmod(optionflags=doctest.ELLIPSIS, verbose=verbose) 352 sys.exit(failed) 353 354 355 if __name__ == '__main__': # pragma: nocover 356 _runDoctests() 357