4 """ ID3 Decision Trees
5
6 contains an implementation of the ID3 decision tree algorithm
7 as described in Tom Mitchell's book "Machine Learning"
8
9 It relies upon the _Tree.TreeNode_ data structure (or something
10 with the same API) defined locally to represent the trees
11
12 """

import numpy

from rdkit.ML.DecTree import DecTree
from rdkit.ML.InfoTheory import entropy


def CalcTotalEntropy(examples, nPossibleVals):
  """ Calculates the total entropy of the data set (w.r.t. the results)

  **Arguments**

    - examples: a list (nInstances long) of lists of variable values + instance
      values

    - nPossibleVals: a list (nVars long) of the number of possible values each variable
      can adopt.

  **Returns**

    a float containing the informational entropy of the data set.

  """
  # only the result column (the last entry of each example) matters here
  nRes = nPossibleVals[-1]
  resList = numpy.zeros(nRes, 'i')
  for example in examples:
    res = int(example[-1])
    resList[res] += 1
  return entropy.InfoEntropy(resList)
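
# A minimal sketch of what CalcTotalEntropy computes (hypothetical toy data,
# not part of the original module): four instances whose result column is
# split evenly between two result codes carry exactly one bit of entropy.
#
#   >>> examples = [[0, 0], [1, 0], [0, 1], [1, 1]]
#   >>> nPossibleVals = [2, 2]
#   >>> CalcTotalEntropy(examples, nPossibleVals)
#   1.0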


def GenVarTable(examples, nPossibleVals, vars):
  """Generates a list of variable tables for the examples passed in.

  The table for a given variable records the number of times each possible value
  of that variable appears for each possible result of the function.

  **Arguments**

    - examples: a list (nInstances long) of lists of variable values + instance
      values

    - nPossibleVals: a list containing the number of possible values of
      each variable + the number of values of the function.

    - vars: a list of the variables to include in the var table

  **Returns**

    a list of variable result tables. Each table is a numpy array
    which is varValues x nResults
  """
  nVars = len(vars)
  res = [None] * nVars
  nFuncVals = nPossibleVals[-1]

  for i in range(nVars):
    res[i] = numpy.zeros((nPossibleVals[vars[i]], nFuncVals), 'i')
  for example in examples:
    val = int(example[-1])
    for i in range(nVars):
      res[i][int(example[vars[i]]), val] += 1

  return res
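
# A small sketch of the table GenVarTable builds (hypothetical data, here the
# OR function of two binary variables): for variable 0, row v / column r counts
# the examples having variable 0 == v and result == r.
#
#   >>> examples = [[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]
#   >>> GenVarTable(examples, [2, 2, 2], [0])[0]
#   array([[1, 1],
#          [0, 2]], dtype=int32)    # exact dtype display may vary by platform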


def ID3(examples, target, attrs, nPossibleVals, depth=0, maxDepth=-1, **kwargs):
80 """ Implements the ID3 algorithm for constructing decision trees.
81
82 From Mitchell's book, page 56
83
84 This is *slightly* modified from Mitchell's book because it supports
85 multivalued (non-binary) results.
86
87 **Arguments**
88
89 - examples: a list (nInstances long) of lists of variable values + instance
90 values
91
92 - target: an int
93
94 - attrs: a list of ints indicating which variables can be used in the tree
95
96 - nPossibleVals: a list containing the number of possible values of
97 every variable.
98
99 - depth: (optional) the current depth in the tree
100
101 - maxDepth: (optional) the maximum depth to which the tree
102 will be grown
103
104 **Returns**
105
106 a DecTree.DecTreeNode with the decision tree
107
108 **NOTE:** This code cannot bootstrap (start from nothing...)
109 use _ID3Boot_ (below) for that.
110 """
  varTable = GenVarTable(examples, nPossibleVals, attrs)
  tree = DecTree.DecTreeNode(None, 'node')

  # store the total entropy of the data at this node... it may be of
  #  interest later
  totEntropy = CalcTotalEntropy(examples, nPossibleVals)
  tree.SetData(totEntropy)

  # the results matrix for the variable we last split on:
  tMat = GenVarTable(examples, nPossibleVals, [target])[0]
  # the total count of each result code (the column sums of tMat):
  counts = sum(tMat)
  nzCounts = numpy.nonzero(counts)[0]

  if len(nzCounts) == 1:
    # bottomed out because there is only one result code left
    #  with any counts (i.e. all remaining examples are of one
    #  class), so this becomes a terminal node
    res = nzCounts[0]
    tree.SetLabel(res)
    tree.SetName(str(res))
    tree.SetTerminal(1)
  elif len(attrs) == 0 or (maxDepth >= 0 and depth >= maxDepth):
    # bottomed out because there are no variables left or the
    #  maximum depth has been reached; fall back to the heuristic
    #  of assigning the most prevalent result code
    v = numpy.argmax(counts)
    tree.SetLabel(v)
    tree.SetName('%d?' % v)
    tree.SetTerminal(1)
  else:
    # find the variable which gives us the largest information gain
    gains = [entropy.InfoGain(x) for x in varTable]
    best = attrs[numpy.argmax(gains)]

    # remove that variable from the list of possible variables
    nextAttrs = attrs[:]
    if not kwargs.get('recycleVars', 0):
      nextAttrs.remove(best)

    # set some info at this node
    tree.SetName('Var: %d' % best)
    tree.SetLabel(best)
    tree.SetTerminal(0)

    # loop over the possible values of the chosen variable and
    #  build a subtree for each one
    for val in range(nPossibleVals[best]):
      nextExamples = []
      for example in examples:
        if example[best] == val:
          nextExamples.append(example)
      if len(nextExamples) == 0:
        # this particular value of the variable has no examples,
        #  so there's no sense in recursing; add a terminal node
        #  with the most prevalent result code
        v = numpy.argmax(counts)
        tree.AddChild('%d' % v, label=v, data=0.0, isTerminal=1)
      else:
        # recurse
        tree.AddChildNode(
          ID3(nextExamples, best, nextAttrs, nPossibleVals, depth + 1, maxDepth, **kwargs))
  return tree


def ID3Boot(examples, attrs, nPossibleVals, initialVar=None, depth=0, maxDepth=-1, **kwargs):
180 """ Bootstrapping code for the ID3 algorithm
181
182 see ID3 for descriptions of the arguments
183
184 If _initialVar_ is not set, the algorithm will automatically
185 choose the first variable in the tree (the standard greedy
186 approach). Otherwise, _initialVar_ will be used as the first
187 split.
188
189 """
  totEntropy = CalcTotalEntropy(examples, nPossibleVals)
  varTable = GenVarTable(examples, nPossibleVals, attrs)

  tree = DecTree.DecTreeNode(None, 'node')
  # tell the tree how many result codes the data set has:
  tree._nResultCodes = nPossibleVals[-1]

  # pick the initial split: either the caller's choice or the
  #  variable with the largest information gain
  if initialVar is None:
    best = attrs[numpy.argmax([entropy.InfoGain(x) for x in varTable])]
  else:
    best = initialVar

  tree.SetName('Var: %d' % best)
  tree.SetData(totEntropy)
  tree.SetLabel(best)
  tree.SetTerminal(0)
  nextAttrs = list(attrs)
  if not kwargs.get('recycleVars', 0):
    nextAttrs.remove(best)

  # build a subtree for each possible value of the initial variable
  for val in range(nPossibleVals[best]):
    nextExamples = []
    for example in examples:
      if example[best] == val:
        nextExamples.append(example)

    tree.AddChildNode(ID3(nextExamples, best, nextAttrs, nPossibleVals, depth, maxDepth, **kwargs))
  return tree
220