4 """ Informational Entropy functions
5
6 The definitions used are the same as those in Tom Mitchell's
7 book "Machine Learning"
8
9 """
import math

import numpy

try:
  import rdkit.ML.InfoTheory.rdInfoTheory as cEntropy
except ImportError:
  hascEntropy = 0
else:
  hascEntropy = 1

# natural log of 2, used to convert numpy.log() values to base 2 (bits)
_log2 = math.log(2)


def PyInfoEntropy(results):
  """ Calculates the informational entropy of a set of results.

  **Arguments**

    results is a 1D numpy array containing the number of times a
    given set hits each possible result.
    For example, if a function has 3 possible results, and the
    variable in question hits them 5, 6 and 1 times each,
    results would be [5,6,1]

  **Returns**

    the informational entropy

  """
  nInstances = float(sum(results))
  if nInstances == 0:
    # an empty set of results carries no information
    return 0
  probs = results / nInstances

  # numpy.log() chokes on log(0.0), so zero probabilities are swapped for 1
  # before taking the log; the corresponding -p * log(p) terms are zero
  # anyway, so they contribute nothing to the sum.
  t = numpy.choose(numpy.greater(probs, 0.0), (1, probs))
  return sum(-probs * numpy.log(t) / _log2)
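
# A quick usage sketch (illustrative addition, not part of the original code):
# the docstring example [5, 6, 1] corresponds to probabilities
# [5/12, 6/12, 1/12], whose entropy is roughly 1.33 bits:
#
#   import numpy
#   PyInfoEntropy(numpy.array([5, 6, 1]))   # -> approximately 1.325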


def PyInfoGain(varMat):
  """ Calculates the expected information gain for a variable

  **Arguments**

    varMat is a numpy array with the number of occurrences of each result
    for each possible value of the given variable.

    So, for a variable which adopts 4 possible values and a result which
    has 3 possible values, varMat would be 4x3

  **Returns**

    the expected information gain

  """
  # per-value totals (Mitchell's |S_v|) and per-result totals over the
  # whole set (used for Entropy(S))
  variableRes = numpy.sum(varMat, 1)
  overallRes = numpy.sum(varMat, 0)

  # weighted sum of the subset entropies: sum_v |S_v| * Entropy(S_v)
  term2 = 0
  for i in range(len(variableRes)):
    term2 = term2 + variableRes[i] * InfoEntropy(varMat[i])
  tSum = sum(overallRes)
  if tSum != 0.0:
    term2 = 1. / tSum * term2
    gain = InfoEntropy(overallRes) - term2
  else:
    gain = 0
  return gain
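
# A quick usage sketch (illustrative addition, not part of the original code):
# for a binary variable and a binary result with counts [[3, 1], [1, 3]],
# the overall entropy is 1 bit and each subset's entropy is about 0.81 bits,
# so the expected gain is about 1 - 0.81 = 0.19 bits:
#
#   import numpy
#   PyInfoGain(numpy.array([[3, 1], [1, 3]]))   # -> approximately 0.189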


# use the C++ implementations when the extension module is available,
# otherwise fall back to the pure-Python versions defined above
if hascEntropy:
  InfoEntropy = cEntropy.InfoEntropy
  InfoGain = cEntropy.InfoGain
else:
  InfoEntropy = PyInfoEntropy
  InfoGain = PyInfoGain