1
2
3
4
5
""" Defines Naive Bayesian classification model

Based on development in: Chapter 6 of "Machine Learning" by Tom Mitchell

"""
10 import numpy
11 from rdkit.ML.Data import Quantize
12 from rdkit.six import iteritems
13
14
16 bid = 0
17 for bnd in qBounds:
18 if (val > bnd):
19 bid += 1
20 return bid
21
22
23
24
25
26
class NaiveBayesClassifier(object):
  """ Naive Bayes classification model.

  Based on the development in Chapter 6 of "Machine Learning" by Tom Mitchell.

  _NaiveBayesClassifier_s can save the following pieces of internal state, accessible via
  standard setter/getter functions:

    1) _Examples_: a list of examples which have been predicted

    2) _TrainingExamples_: list of training examples - the descriptor values of these examples
       are quantized based on info gain using ML/Data/Quantize.py if necessary

    3) _TestExamples_: the list of examples used to test the model

    4) _BadExamples_ : list of examples that were incorrectly classified

    5) _QBoundVals_: quant bound values for each variable - a list of lists

    6) _QBounds_ : number of bounds for each variable

  """

  def __init__(self, attrs, nPossibleVals, nQuantBounds, mEstimateVal=-1.0, useSigs=False):
    """ Constructor

      Arguments:
        attrs: indices (into each example) of the descriptors to be used
        nPossibleVals: number of possible values for each descriptor; the last
          entry is the number of outcome classes
        nQuantBounds: number of quantization bounds to be found for each
          descriptor (0 means the descriptor is used as-is)
        mEstimateVal: if > 0, the "m" of the m-estimate smoothing applied when
          normalizing the conditional probabilities (Mitchell, ch. 6)
        useSigs: if True, examples are expected to carry a bit container at
          example[1] exposing a GetBit() method (presumably an RDKit bit
          vector - confirm against callers) instead of descriptor columns
    """
    self._attrs = attrs
    self._mEstimateVal = mEstimateVal
    self._useSigs = useSigs

    self._classProbs = {}  # class id -> prior probability, filled by trainModel()

    self._examples = []
    self._trainingExamples = []
    self._testExamples = []
    self._badExamples = []
    self._QBoundVals = {}  # attr index -> list of quant bounds found for it
    self._nClasses = nPossibleVals[-1]
    self._qBounds = nQuantBounds
    self._nPosVals = nPossibleVals
    self._needsQuant = 1

    self._name = ""
    self.mprob = -1.0  # probability of the most recent prediction

    # Storage for the conditional probabilities P(attr value | class):
    #  - descriptor mode: one list of bin counters per attribute index
    #  - signature mode: a dict keyed by bit index, two counters (off/on) each
    self._condProbs = [None] * self._nClasses
    for i in range(self._nClasses):
      if not self._useSigs:
        nA = max(self._attrs) + 1
        self._condProbs[i] = [None] * nA
        for j in range(nA):
          nV = self._nPosVals[j]
          if self._qBounds[j]:
            # quantization can create up to qBounds[j]+1 bins
            nV = max(nV, self._qBounds[j] + 1)
          self._condProbs[i][j] = [0.0] * nV
      else:
        self._condProbs[i] = {}
        for idx in self._attrs:
          self._condProbs[i][idx] = [0.0] * 2

  def GetName(self):
    """ Returns the model's name """
    return self._name

  def SetName(self, name):
    """ Sets the model's name """
    self._name = name

  def NameModel(self, varNames):
    """ Gives the model a default name; `varNames` is accepted for interface
      compatibility with other rdkit.ML models but is not used here.
    """
    self.SetName('NaiveBayesClassifier')

  def GetExamples(self):
    """ Returns the list of examples that have been predicted """
    return self._examples

  def SetExamples(self, examples):
    """ Sets the list of examples that have been predicted """
    self._examples = examples

  def GetTrainingExamples(self):
    """ Returns the list of training examples """
    return self._trainingExamples

  def SetTrainingExamples(self, examples):
    """ Sets the list of training examples """
    self._trainingExamples = examples

  def GetTestExamples(self):
    """ Returns the list of examples used to test the model """
    return self._testExamples

  def SetTestExamples(self, examples):
    """ Sets the list of examples used to test the model """
    self._testExamples = examples

  def SetBadExamples(self, examples):
    """ Sets the list of incorrectly classified examples """
    self._badExamples = examples

  def GetBadExamples(self):
    """ Returns the list of incorrectly classified examples """
    return self._badExamples

  def _computeQuantBounds(self):
    """ Finds, for every attribute with a nonzero quant-bound count, the set of
      bounds (up to self._qBounds[ai] of them) that maximizes information gain
      over the training examples.  Results are stored in self._QBoundVals.
    """
    neg = len(self._trainingExamples)
    natr = len(self._attrs)

    # collect the attribute values and outcomes into a matrix / list
    allVals = numpy.zeros((neg, natr), 'd')
    res = []  # outcome (class) of each training example
    for i, eg in enumerate(self._trainingExamples):
      res.append(eg[-1])
      for j, ai in enumerate(self._attrs):
        allVals[i, j] = eg[ai]

    # find the optimal quantization boundaries for each attribute, trying every
    # bound count from 1 up to the requested maximum and keeping the best
    for i, ai in enumerate(self._attrs):
      nbnds = self._qBounds[ai]
      if nbnds > 0:
        mbnds = []
        mgain = -1.0
        for j in range(1, nbnds + 1):
          bnds, igain = Quantize.FindVarMultQuantBounds(allVals[:, i], j, res, self._nClasses)
          if igain > mgain:
            mbnds = bnds
            mgain = igain
        self._QBoundVals[ai] = mbnds

  def trainModel(self):
    """ We will assume at this point that the training examples have been set

    We have to estimate the conditional probabilities for each of the (binned)
    descriptor components given an outcome (or class). The probability of each
    class is also estimated.

    Raises ZeroDivisionError if no training examples have been set.
    """
    n = len(self._trainingExamples)
    for i in range(self._nClasses):
      self._classProbs[i] = 0.0

    # first, quantize the descriptor values if that has been requested
    if not self._useSigs and max(self._qBounds) > 0:
      self._computeQuantBounds()

    # accumulate raw counts: class priors and per-class per-attribute bin counts
    ncls = {}  # class id -> number of training examples in that class
    incr = 1.0 / n
    for eg in self._trainingExamples:
      cls = eg[-1]
      self._classProbs[cls] += incr
      ncls[cls] = ncls.get(cls, 0) + 1
      tmp = self._condProbs[cls]
      if not self._useSigs:
        for ai in self._attrs:
          bid = eg[ai]
          if self._qBounds[ai] > 0:
            bid = _getBinId(bid, self._QBoundVals[ai])
          tmp[ai][bid] += 1.0
      else:
        for ai in self._attrs:
          if eg[1].GetBit(ai):
            tmp[ai][1] += 1.0
          else:
            tmp[ai][0] += 1.0

    # normalize the counts into conditional probabilities
    for cls in range(self._nClasses):
      if cls not in ncls:
        # no training examples for this class; leave its counters at zero
        continue

      tmp = self._condProbs[cls]
      for ai in self._attrs:
        if not self._useSigs:
          nbnds = self._nPosVals[ai]
          if self._qBounds[ai] > 0:
            # NOTE(review): quantization yields len(_QBoundVals[ai])+1 bins but
            # only _qBounds[ai] are normalized here - confirm whether the last
            # bin is intentionally skipped
            nbnds = self._qBounds[ai]
        else:
          nbnds = 2
        for bid in range(nbnds):
          if self._mEstimateVal <= 0.0:
            # plain maximum-likelihood estimate: count / class size
            tmp[ai][bid] /= ncls[cls]
          else:
            # m-estimate (Mitchell, ch. 6):
            #   P = (count + m*p) / (nc + m)
            # where p is a prior on the descriptor value - we use a uniform
            # prior of 1/(number of possible bins for this descriptor)
            pdesc = 0.0
            if self._qBounds[ai] > 0:
              pdesc = 1.0 / (1 + len(self._QBoundVals[ai]))
            elif self._nPosVals[ai] > 0:
              pdesc = 1.0 / (self._nPosVals[ai])
            else:
              raise ValueError('Neither Bounds set nor data pre-quantized for attribute ' + str(ai))
            tmp[ai][bid] += (self._mEstimateVal) * pdesc
            tmp[ai][bid] /= (ncls[cls] + self._mEstimateVal)

  def ClassifyExamples(self, examples, appendExamples=0):
    """ Classifies a list of examples; returns the predictions as a list of ints """
    preds = []
    for eg in examples:
      pred = self.ClassifyExample(eg, appendExamples)
      preds.append(int(pred))
    return preds

  def GetClassificationDetails(self):
    """ returns the probability of the last prediction """
    return self.mprob

  def ClassifyExample(self, example, appendExamples=0):
    """ Classify an example by multiplying the conditional probabilities.
    The most likely class (the one with the largest probability) is returned;
    its (unnormalized) probability is stored in self.mprob.
    """
    if appendExamples:
      self._examples.append(example)
    clsProb = {}
    for key, prob in self._classProbs.items():
      clsProb[key] = prob
      tmp = self._condProbs[key]
      for ai in self._attrs:
        # hasattr() guard kept for models pickled before _useSigs existed
        if not (hasattr(self, '_useSigs') and self._useSigs):
          bid = example[ai]
          if self._qBounds[ai] > 0:
            bid = _getBinId(bid, self._QBoundVals[ai])
        else:
          if example[1].GetBit(ai):
            bid = 1
          else:
            bid = 0
        clsProb[key] *= tmp[ai][bid]

    mkey = -1
    self.mprob = -1.0
    for key, prob in clsProb.items():
      if prob > self.mprob:
        mkey = key
        self.mprob = prob

    return mkey
280