Package rdkit :: Package Chem :: Package Pharm2D :: Module SigFactory
[hide private]
[frames] | [no frames]

Source Code for Module rdkit.Chem.Pharm2D.SigFactory

  1  # 
  2  # Copyright (C) 2003-2008 greg Landrum and Rational Discovery LLC 
  3  # 
  4  #   @@ All Rights Reserved @@ 
  5  #  This file is part of the RDKit. 
  6  #  The contents are covered by the terms of the BSD license 
  7  #  which is included in the file license.txt, found at the root 
  8  #  of the RDKit source tree. 
  9  # 
 10  """ contains factory class for producing signatures 
 11   
 12   
 13  """ 
 14  from __future__ import print_function, division 
 15   
 16  import copy 
 17   
 18  import numpy 
 19   
 20  from rdkit.Chem.Pharm2D import Utils 
 21  from rdkit.DataStructs import SparseBitVect, IntSparseIntVect, LongSparseIntVect 
 22   
# Module-level debug switch: when set to True, several SigFactory methods
# (_findBinIdx, GetBitIdx, GetBitInfo) print tracing output to stdout.
_verbose = False
 24   
 25   
class SigFactory(object):
    """Factory class for producing pharmacophore signatures.

    SigFactory's are used by creating one, setting the relevant
    parameters, then calling the GetSignature() method each time a
    signature is required.

    SetBins() (or an explicit call to Init()) must be made before any of
    the size/index related methods are used, because those rely on the
    internal tables that Init() builds.

    """

    def __init__(self, featFactory, useCounts=False, minPointCount=2, maxPointCount=3,
                 shortestPathsOnly=True, includeBondOrder=False, skipFeats=None,
                 trianglePruneBins=True):
        """Stores the signature parameters; no tables are built here.

        **Arguments**

          - featFactory: object providing GetFeatureFamilies() and
            GetFeaturesForMol(mol, includeOnly=...)

          - useCounts: if true, Init() selects a count-vector signature
            class instead of SparseBitVect

          - minPointCount, maxPointCount: inclusive range of pharmacophore
            sizes covered by the signature

          - shortestPathsOnly, includeBondOrder: stored on the instance;
            not interpreted by this class

          - skipFeats: feature family names to leave out (None -> [])

          - trianglePruneBins: forwarded to Utils.GetPossibleScaffolds as
            useTriangleInequality

        """
        self.featFactory = featFactory
        self.useCounts = useCounts
        self.minPointCount = minPointCount
        self.maxPointCount = maxPointCount
        self.shortestPathsOnly = shortestPathsOnly
        self.includeBondOrder = includeBondOrder
        self.trianglePruneBins = trianglePruneBins
        # a fresh list per instance avoids sharing a mutable default
        self.skipFeats = [] if skipFeats is None else skipFeats
        self._bins = None
        self.sigKlass = None

    def SetBins(self, bins):
        """ bins should be a list of 2-tuples

        A shallow copy of the bins is stored and Init() is called so that
        the internal tables match the new bins.

        """
        self._bins = copy.copy(bins)
        self.Init()

    def GetBins(self):
        """ returns the stored bins (None until SetBins() has been called) """
        return self._bins

    def GetNumBins(self):
        """ returns the number of stored bins """
        return len(self._bins)

    def GetSignature(self):
        """ returns a new, empty signature of the class and size chosen by Init() """
        return self.sigKlass(self._sigSize)

    def _GetBitSummaryData(self, bitIdx):
        """ Internal use only

        Collects the data used to describe a bit.

        **Returns**

          a 5-tuple: (nPts, combo, scaffold, labels, dMat) where labels are
          the feature family names for combo and dMat is a symmetric
          nPts x nPts integer matrix holding the scaffold's bin indices

        """
        nPts, combo, scaffold = self.GetBitInfo(bitIdx)
        fams = self.GetFeatFamilies()
        labels = [fams[x] for x in combo]
        # builtin int: the numpy.int alias no longer exists in current NumPy
        dMat = numpy.zeros((nPts, nPts), int)
        for idx, (i, j) in enumerate(Utils.nPointDistDict[nPts]):
            dMat[i, j] = scaffold[idx]
            dMat[j, i] = scaffold[idx]

        return nPts, combo, scaffold, labels, dMat

    def GetBitDescriptionAsText(self, bitIdx, includeBins=0, fullPage=1):
        """ returns text with a description of the bit

        **Arguments**

          - bitIdx: an integer bit index

          - includeBins: (optional) if nonzero, information about the bins will be
            included as well

          - fullPage: (optional) if nonzero, html headers and footers will
            be included (so as to make the output a complete page)

        **Returns**

          a string with the HTML

        **Note**

          this is currently not implemented and always raises
          NotImplementedError

        """
        raise NotImplementedError('Missing implementation')

    def GetBitDescription(self, bitIdx):
        """ returns a text description of the bit

        **Arguments**

          - bitIdx: an integer bit index

        **Returns**

          a string: the space-separated feature family labels followed by
          the rows of the bin-index matrix, delimited by "|"

        """
        nPts, combo, scaffold, labels, dMat = self._GetBitSummaryData(bitIdx)
        rows = ["|" + " ".join([str(x) for x in row]) for row in dMat]
        return " ".join(labels) + " " + "".join(rows) + "|"

    def _findBinIdx(self, dists, bins, scaffolds):
        """ OBSOLETE: this has been rewritten in C++
        Internal use only
         Returns the index of a bin defined by a set of distances.

        **Arguments**

          - dists: a sequence of distances (not binned)

          - bins: a sorted sequence of distance bins (2-tuples)

          - scaffolds: a list of possible scaffolds (bin combinations)

        **Returns**

          an integer bin index, or None if one of the distances does not
          fall into any bin

        **Raises**

          ValueError (from list.index) if the bin combination is not one
          of the scaffolds

        **Note**

          the value returned here is not an index in the overall
          signature.  It is, rather, an offset of a scaffold in the
          possible combinations of distance bins for a given
          proto-pharmacophore.

        """
        whichBins = []

        # This would be a ton easier if we had contiguous bins
        # i.e. if we could maintain the bins as a list of bounds)
        # because then we could use Python's bisect module.
        # Since we can't do that, we've got to do our own binary
        # search here.
        for dist in dists:
            where = -1
            startP, endP = 0, len(bins)
            while startP < endP:
                midP = (startP + endP) // 2
                begBin, endBin = bins[midP]
                if dist < begBin:
                    endP = midP
                elif dist >= endBin:
                    startP = midP + 1
                else:
                    # bins are half-open: begBin <= dist < endBin
                    where = midP
                    break
            if where < 0:
                return None
            whichBins.append(where)
        res = scaffolds.index(tuple(whichBins))
        if _verbose:
            print('----- _fBI -----------')
            print(' scaffolds:', scaffolds)
            print(' bins:', whichBins)
            print(' res:', res)
        return res

    def GetFeatFamilies(self):
        """ returns the sorted list of feature family names that are not in skipFeats """
        fams = [fam for fam in self.featFactory.GetFeatureFamilies() if fam not in self.skipFeats]
        fams.sort()
        return fams

    def GetMolFeats(self, mol):
        """ returns the feature matches for a molecule

        **Returns**

          a list with one entry per family (in GetFeatFamilies() order);
          each entry is the list of GetAtomIds() results for the features
          of that family found in mol

        """
        getFeats = self.featFactory.GetFeaturesForMol
        return [[feat.GetAtomIds() for feat in getFeats(mol, includeOnly=fam)]
                for fam in self.GetFeatFamilies()]

    def GetBitIdx(self, featIndices, dists, sortIndices=True):
        """ returns the index for a pharmacophore described using a set of
          feature indices and distances

        **Arguments**

          - featIndices: a sequence of feature indices

          - dists: a sequence of distance between the features, only the
            unique distances should be included, and they should be in the
            order defined in Utils.

          - sortIndices : sort the indices

        **Returns**

          the integer bit index

        **Raises**

          - NotImplementedError: for more than 3 points

          - IndexError: if the number of points is outside
            [minPointCount, maxPointCount], if any feature index is out of
            range, or if the distances do not map to a known scaffold

        """
        nPoints = len(featIndices)
        if nPoints > 3:
            raise NotImplementedError('>3 points not supported')
        if nPoints < self.minPointCount:
            raise IndexError('bad number of points')
        if nPoints > self.maxPointCount:
            raise IndexError('bad number of points')

        # this is the start of the nPoint-point pharmacophores
        startIdx = self._starts[nPoints]

        #
        #  now we need to map the pattern indices to an offset from startIdx
        #
        if sortIndices:
            featIndices = sorted(featIndices)

        # check every index: with sortIndices=False the smallest one is not
        # necessarily in the first position
        if min(featIndices) < 0:
            raise IndexError('bad feature index')
        if max(featIndices) >= self._nFeats:
            raise IndexError('bad feature index')

        if nPoints == 3:
            featIndices, dists = Utils.OrderTriangle(featIndices, dists)

        offset = Utils.CountUpTo(self._nFeats, nPoints, featIndices)
        if _verbose:
            print('offset for feature %s: %d' % (str(featIndices), offset))
        offset *= len(self._scaffolds[len(dists)])

        if _verbose:
            print('>>>>>>>>>>>>>>>>>>>>>>>')
            print('\tScaffolds:', repr(self._scaffolds[len(dists)]), type(self._scaffolds[len(dists)]))
            print('\tDists:', repr(dists), type(dists))
            print('\tbins:', repr(self._bins), type(self._bins))
        try:
            bin_ = self._findBinIdx(dists, self._bins, self._scaffolds[len(dists)])
        except ValueError:
            # the bin combination is not a known scaffold
            bin_ = None
        if bin_ is None:
            # also reached when a distance falls outside of all bins; report
            # both failures the same way instead of failing on the addition
            fams = self.GetFeatFamilies()
            fams = [fams[x] for x in featIndices]
            raise IndexError('distance bin not found: feats: %s; dists=%s; bins=%s; scaffolds: %s' %
                             (fams, dists, self._bins, self._scaffolds))

        return startIdx + offset + bin_

    def GetBitInfo(self, idx):
        """ returns information about the given bit

        **Arguments**

          - idx: the bit index to be considered

        **Returns**

          a 3-tuple:

            1) the number of points in the pharmacophore

            2) the proto-pharmacophore (tuple of pattern indices)

            3) the scaffold (tuple of distance indices)

        **Raises**

          IndexError if idx is negative or not smaller than the signature
          size

        """
        if idx >= self._sigSize:
            raise IndexError('bad index (%d) queried. %d is the max' % (idx, self._sigSize))
        if idx < 0:
            # negative values would otherwise wrap around via Python's
            # negative indexing and describe an unrelated bit
            raise IndexError('bad index (%d) queried. %d is the max' % (idx, self._sigSize))
        # first figure out how many points are in the p'cophore
        nPts = self.minPointCount
        while nPts < self.maxPointCount and self._starts[nPts + 1] <= idx:
            nPts += 1

        # how far are we in from the start point?
        offsetFromStart = idx - self._starts[nPts]
        if _verbose:
            print('\t %d Points, %d offset' % (nPts, offsetFromStart))

        # lookup the number of scaffolds
        nDists = len(Utils.nPointDistDict[nPts])
        scaffolds = self._scaffolds[nDists]
        nScaffolds = len(scaffolds)

        # figure out to which proto-pharmacophore we belong:
        protoIdx, scaffoldIdx = divmod(offsetFromStart, nScaffolds)
        indexCombos = Utils.GetIndexCombinations(self._nFeats, nPts)
        combo = tuple(indexCombos[protoIdx])
        if _verbose:
            print('\t combo: %s' % (str(combo)))

        # and which scaffold:
        scaffold = scaffolds[scaffoldIdx]
        if _verbose:
            print('\t scaffold: %s' % (str(scaffold)))
        return nPts, combo, scaffold

    def Init(self):
        """ Initializes internal parameters.  This **must** be called after
          making any changes to the signature parameters

        Builds the per-point-count start offsets and scaffold tables,
        computes the total signature size and picks the signature class:
        SparseBitVect when counts are not used, otherwise IntSparseIntVect
        or LongSparseIntVect depending on whether the size is below 2**31.

        """
        self._scaffolds = [0] * (len(Utils.nPointDistDict[self.maxPointCount + 1]))
        self._starts = {}
        # number of feature families that are not skipped
        self._nFeats = sum(1 for fam in self.featFactory.GetFeatureFamilies()
                           if fam not in self.skipFeats)
        accum = 0
        for i in range(self.minPointCount, self.maxPointCount + 1):
            self._starts[i] = accum
            nDistsHere = len(Utils.nPointDistDict[i])
            scaffoldsHere = Utils.GetPossibleScaffolds(i, self._bins,
                                                       useTriangleInequality=self.trianglePruneBins)
            # scaffolds are stored by number of distances, not number of points
            self._scaffolds[nDistsHere] = scaffoldsHere
            accum += Utils.NumCombinations(self._nFeats, i) * len(scaffoldsHere)
        self._sigSize = accum
        if not self.useCounts:
            self.sigKlass = SparseBitVect
        elif self._sigSize < 2**31:
            self.sigKlass = IntSparseIntVect
        else:
            self.sigKlass = LongSparseIntVect

    def GetSigSize(self):
        """ returns the total signature size computed by Init() """
        return self._sigSize


# Optional override: when rdkit.Chem.Pharmacophores.cUtils can be imported,
# its FindBinIdx replaces the pure-Python SigFactory._findBinIdx (whose
# docstring notes it "has been rewritten in C++" -- presumably this is that
# rewrite; confirm against cUtils).  If the import fails, the Python
# implementation stays in place.
try:
    from rdkit.Chem.Pharmacophores import cUtils
except ImportError:
    pass
else:
    SigFactory._findBinIdx = cUtils.FindBinIdx