Package rdkit :: Package ML :: Package NaiveBayes :: Module ClassificationModel
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.NaiveBayes.ClassificationModel

  1  # $Id$ 
  2  # 
  3  #  Copyright (C) 2004-2008 Greg Landrum and Rational Discovery LLC 
  4  #      All Rights Reserved 
  5  # 
  6  """ Defines Naive Bayesian classification model 
  7     Based on development in: Chapter 6 of "Machine Learning" by Tom Mitchell 
  8   
  9  """ 
 10  import numpy 
 11  from rdkit.ML.Data import Quantize 
 12  from rdkit.six import iteritems 
 13   
 14   
15 -def _getBinId(val, qBounds):
16 bid = 0 17 for bnd in qBounds: 18 if (val > bnd): 19 bid += 1 20 return bid
# FIX: this class has not been updated to new-style classes
# (RD Issue380) because that would break all of our legacy pickled
# data. Until a solution is found for this breakage, an update is
# impossible.
class NaiveBayesClassifier:
    """Naive Bayes classifier over quantized descriptors or bit-vector signatures.

    The class is intentionally declared without an explicit base class so that
    legacy pickled instances keep loading; no instance attribute has been added
    or renamed for the same reason.

    Example layout, as used by the code below:

      - descriptor mode (``useSigs`` false): ``example[ai]`` holds the value of
        attribute ``ai`` and ``example[-1]`` holds the class label (training
        only).  A value is used directly as a bin index unless
        ``nQuantBounds[ai] > 0``, in which case it is binned with the bounds
        learned during training.
      - signature mode (``useSigs`` true): ``example[1]`` is an object with a
        ``GetBit(idx)`` method and the attributes are bit indices.

    _NaiveBayesClassifier_s can save the following pieces of internal state,
    accessible via standard setter/getter functions:

      1) _Examples_: a list of examples which have been predicted

      2) _TrainingExamples_: list of training examples - the descriptor values
         of these examples are quantized based on info gain using
         ML/Data/Quantize.py if necessary

      3) _TestExamples_: the list of examples used to test the model

      4) _BadExamples_: list of examples that were incorrectly classified

      5) _QBoundVals_: quantization bound values found for each quantized
         variable - a dictionary mapping attribute index to a list of bounds

      6) _QBounds_: maximum number of bounds to try for each variable
    """

    def __init__(self, attrs, nPossibleVals, nQuantBounds, mEstimateVal=-1.0, useSigs=False):
        """ Constructor

        Arguments:
          - attrs: indices of the attributes (or bit ids when useSigs is true)
            used by the model
          - nPossibleVals: number of possible values for each variable; the
            last entry is taken as the number of classes
          - nQuantBounds: maximum number of quantization bounds per variable
            (a value <= 0 means the variable is not quantized by the model)
          - mEstimateVal: equivalent sample size for the m-estimate of the
            conditional probabilities; values <= 0 select plain frequencies
          - useSigs: if true, examples carry a bit vector in position 1
        """
        self._attrs = attrs
        self._mEstimateVal = mEstimateVal
        self._useSigs = useSigs

        self._classProbs = {}

        self._examples = []
        self._trainingExamples = []
        self._testExamples = []
        self._badExamples = []
        self._QBoundVals = {}
        self._nClasses = nPossibleVals[-1]
        self._qBounds = nQuantBounds
        self._nPosVals = nPossibleVals
        self._needsQuant = 1

        self._name = ""
        self.mprob = -1.0

        # The conditional probabilities are indexed by activity class, then by
        # descriptor id, then by descriptor bin id.  Descriptor mode uses
        # nested lists; signature mode uses a per-class dictionary keyed on
        # the bit id, with one slot for "bit off" and one for "bit on".
        self._condProbs = [None] * self._nClasses
        for i in range(self._nClasses):
            if not useSigs:
                nA = max(self._attrs) + 1
                perAttr = [None] * nA
                for j in range(nA):
                    nV = self._nPosVals[j]
                    if self._qBounds[j]:
                        # n bounds split the value range into n + 1 bins
                        nV = max(nV, self._qBounds[j] + 1)
                    perAttr[j] = [0.0] * nV
                self._condProbs[i] = perAttr
            else:
                perBit = {}
                for idx in self._attrs:
                    perBit[idx] = [0.0] * 2
                self._condProbs[i] = perBit

    def GetName(self):
        """ returns the name of the model """
        return self._name

    def SetName(self, name):
        """ sets the name of the model """
        self._name = name

    def NameModel(self, varNames):
        """ names the model 'NaiveBayesClassifier'; varNames is not used """
        self.SetName('NaiveBayesClassifier')

    def GetExamples(self):
        """ returns the list of examples that have been classified and stored """
        return self._examples

    def SetExamples(self, examples):
        """ replaces the list of stored examples """
        self._examples = examples

    def GetTrainingExamples(self):
        """ returns the list of training examples """
        return self._trainingExamples

    def SetTrainingExamples(self, examples):
        """ sets the list of training examples used by trainModel() """
        self._trainingExamples = examples

    def GetTestExamples(self):
        """ returns the list of test examples """
        return self._testExamples

    def SetTestExamples(self, examples):
        """ sets the list of test examples """
        self._testExamples = examples

    def SetBadExamples(self, examples):
        """ sets the list of incorrectly classified examples """
        self._badExamples = examples

    def GetBadExamples(self):
        """ returns the list of incorrectly classified examples """
        return self._badExamples

    def _computeQuantBounds(self):
        """ Finds quantization bounds for every attribute that requests them

        For each attribute with a positive entry in the bounds-count list, every
        number of bounds from 1 up to that count is tried with
        Quantize.FindVarMultQuantBounds and the set of bounds with the largest
        information gain is stored in self._QBoundVals.
        """
        nExamples = len(self._trainingExamples)
        nAttrs = len(self._attrs)

        # collect the attribute values (one column per attribute) and the results
        allVals = numpy.zeros((nExamples, nAttrs), 'd')
        res = []  # list of y values
        for row, eg in enumerate(self._trainingExamples):
            res.append(eg[-1])
            for col, ai in enumerate(self._attrs):
                allVals[row, col] = eg[ai]

        # now loop over each of the columns and compute the bounds;
        # the number of bounds is determined by the maximum info gain
        for col, ai in enumerate(self._attrs):
            nbnds = self._qBounds[ai]
            if nbnds > 0:
                mbnds = []
                mgain = -1.0
                for j in range(1, nbnds + 1):
                    bnds, igain = Quantize.FindVarMultQuantBounds(allVals[:, col], j, res,
                                                                  self._nClasses)
                    if igain > mgain:
                        mbnds = bnds
                        mgain = igain
                self._QBoundVals[ai] = mbnds

    def trainModel(self):
        """ Estimates the class probabilities and the conditional probabilities

        We will assume at this point that the training examples have been set.

        The conditional probability of each (binned) descriptor component given
        an outcome (class) is estimated, along with the probability of each
        class.  The counts are accumulated into the tables created by the
        constructor, so this method is meant to be called once per instance.

        Raises:
          - ZeroDivisionError if there are no training examples
          - ValueError if the m-estimate is requested for an attribute that has
            neither quantization bounds nor a positive number of possible values
        """
        # instances restored from old pickles may lack the _useSigs attribute
        useSigs = getattr(self, '_useSigs', False)

        n = len(self._trainingExamples)
        for i in range(self._nClasses):
            self._classProbs[i] = 0.0

        # first find the bounds for each descriptor value if necessary
        if not useSigs and max(self._qBounds) > 0:
            self._computeQuantBounds()

        # now accumulate the class frequencies and the per-class bin counts
        ncls = {}
        incr = 1.0 / n
        for eg in self._trainingExamples:
            cls = eg[-1]
            self._classProbs[cls] += incr
            ncls[cls] = ncls.get(cls, 0) + 1
            tmp = self._condProbs[cls]
            if not useSigs:
                for ai in self._attrs:
                    bid = eg[ai]
                    if self._qBounds[ai] > 0:
                        bid = _getBinId(bid, self._QBoundVals[ai])
                    tmp[ai][bid] += 1.0
            else:
                for ai in self._attrs:
                    if eg[1].GetBit(ai):
                        tmp[ai][1] += 1.0
                    else:
                        tmp[ai][0] += 1.0

        # finally turn the counts into probabilities
        for cls in range(self._nClasses):
            if cls not in ncls:
                continue
            nInClass = ncls[cls]
            tmp = self._condProbs[cls]
            for ai in self._attrs:
                binVals = tmp[ai]
                # Every allocated bin is normalized.  A quantized attribute with
                # k bounds has k + 1 bins, so stopping at k would leave the top
                # bin as a raw count that is later used as a probability.
                nBins = len(binVals)
                if self._mEstimateVal <= 0.0:
                    # this is simply the fraction of the time this descriptor
                    # component assumes this value for the examples that belong
                    # to a specific class
                    for bid in range(nBins):
                        binVals[bid] /= nInClass
                else:
                    # this is a bit more complicated form - more appropriate for
                    # unbalanced data; see "Machine Learning" by Tom Mitchell
                    # section 6.9.1.1
                    #
                    # pdesc is the probability that this descriptor component
                    # takes a specific value in the lack of any other
                    # information: the inverse of the number of possible values.
                    # If we quantized this component then
                    #   npossible = 1 + len(self._QBoundVals[ai])
                    # else (the descriptor came quantized)
                    #   npossible = nPossibleVals[ai]
                    if self._qBounds[ai] > 0:
                        pdesc = 1.0 / (1 + len(self._QBoundVals[ai]))
                    elif self._nPosVals[ai] > 0:
                        pdesc = 1.0 / (self._nPosVals[ai])
                    else:
                        raise ValueError('Neither Bounds set nor data pre-quantized for attribute ' +
                                         str(ai))
                    for bid in range(nBins):
                        binVals[bid] += (self._mEstimateVal) * pdesc
                        binVals[bid] /= (nInClass + self._mEstimateVal)

    def ClassifyExamples(self, examples, appendExamples=0):
        """ Classifies each example and returns the list of predicted classes (as ints) """
        return [int(self.ClassifyExample(eg, appendExamples)) for eg in examples]

    def GetClassificationDetails(self):
        """ returns the probability of the last prediction """
        return self.mprob

    def ClassifyExample(self, example, appendExamples=0):
        """ Classify an example by multiplying the class probability with the
        conditional probabilities of the example's descriptor bins.
        The most likely class is the one with the largest product.

        If appendExamples is true the example is also stored in the list
        returned by GetExamples().  The winning product is stored in self.mprob;
        -1 is returned if no class has been assigned a probability.
        """
        if appendExamples:
            self._examples.append(example)

        # instances restored from old pickles may lack the _useSigs attribute
        useSigs = getattr(self, '_useSigs', False)

        clsProb = {}
        for key, prob in self._classProbs.items():
            tmp = self._condProbs[key]
            for ai in self._attrs:
                if not useSigs:
                    bid = example[ai]
                    if self._qBounds[ai] > 0:
                        bid = _getBinId(bid, self._QBoundVals[ai])
                else:
                    if example[1].GetBit(ai):
                        bid = 1
                    else:
                        bid = 0
                prob *= tmp[ai][bid]
            clsProb[key] = prob

        mkey = -1
        self.mprob = -1.0
        for key, prob in clsProb.items():
            # strict comparison: on a tie the class seen first wins
            if prob > self.mprob:
                mkey = key
                self.mprob = prob

        return mkey
280