
Source Code for Module rdkit.ML.InfoTheory.entropy

#
#  Copyright (C) 2000-2008  greg Landrum and Rational Discovery LLC
#
""" Informational Entropy functions

  The definitions used are the same as those in Tom Mitchell's
  book "Machine Learning"

"""
import math

import numpy


# try to get the C versions of these routines
try:
  import rdkit.ML.InfoTheory.rdInfoTheory as cEntropy
except ImportError:
  hascEntropy = 0
else:
  hascEntropy = 1

# it's pretty obvious what this is for ;-)
_log2 = math.log(2)

def PyInfoEntropy(results):
  """ Calculates the informational entropy of a set of results.

  **Arguments**

    results is a 1D numpy array containing the number of times a
    given set hits each possible result.
    For example, if a function has 3 possible results, and the
    variable in question hits them 5, 6 and 1 times each,
    results would be [5,6,1]

  **Returns**

    the informational entropy

  """
  nInstances = float(sum(results))
  if nInstances == 0:
    # to return zero or one... that is the question
    return 0
  probs = results / nInstances

  # -------
  # NOTE: this is a little hack to allow the use of numpy
  #  functionality to calculate the informational entropy.
  #  The problem is that the system log function pitches a fit
  #  when you call log(0.0).  We are perfectly happy with that
  #  returning *anything* because we're gonna multiply by 0 anyway.

  # Here's the risky (but marginally faster) way to do it:
  #  add a small number to probs and hope it doesn't screw
  #  things up too much.
  # t = probs+1e-10

  # Here's a perfectly safe approach that's a little bit more obfuscated
  #  and a tiny bit slower
  t = numpy.choose(numpy.greater(probs, 0.0), (1, probs))
  return sum(-probs * numpy.log(t) / _log2)

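# A quick sketch of how PyInfoEntropy behaves, using the counts described in
#  the docstring above (illustrative doctest-style calls, not exhaustive tests):
#  >>> PyInfoEntropy(numpy.array([5, 6, 1]))   # mixed counts, roughly 1.3 bits
#  >>> PyInfoEntropy(numpy.array([6, 0, 6]))   # two equally likely results: 1.0
#  >>> PyInfoEntropy(numpy.array([12, 0, 0]))  # a single result: 0.0, no uncertainty
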
def PyInfoGain(varMat):
  """ calculates the information gain for a variable

  **Arguments**

    varMat is a numpy array with the number of occurrences of each result
      for each possible value of the given variable.

    So, for a variable which adopts 4 possible values and a result which
      has 3 possible values, varMat would be 4x3

  **Returns**

    The expected information gain
  """
  variableRes = numpy.sum(varMat, 1)  # indexed by variable, Sv in Mitchell's notation
  overallRes = numpy.sum(varMat, 0)  # indexed by result, S in Mitchell's notation

  term2 = 0
  for i in range(len(variableRes)):
    term2 = term2 + variableRes[i] * InfoEntropy(varMat[i])
  tSum = sum(overallRes)
  if tSum != 0.0:
    term2 = 1. / tSum * term2
    gain = InfoEntropy(overallRes) - term2
  else:
    gain = 0
  return gain

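# A small sketch of PyInfoGain on toy contingency tables (rows are variable
#  values, columns are result classes; the counts are made up for illustration):
#  >>> PyInfoGain(numpy.array([[8, 0], [0, 8]]))  # variable determines the result: gain 1.0
#  >>> PyInfoGain(numpy.array([[4, 4], [4, 4]]))  # result independent of the variable: gain 0.0
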

# if we have the C versions, use them, otherwise use the python stuff
if hascEntropy:
  InfoEntropy = cEntropy.InfoEntropy
  InfoGain = cEntropy.InfoGain
else:
  InfoEntropy = PyInfoEntropy
  InfoGain = PyInfoGain
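# Usage sketch: callers are expected to go through InfoEntropy / InfoGain,
#  which point at the compiled rdInfoTheory routines when that extension
#  imported cleanly and at the Python fallbacks above otherwise; the two
#  backends are intended to be interchangeable:
#  >>> import numpy
#  >>> from rdkit.ML.InfoTheory import entropy
#  >>> entropy.InfoEntropy(numpy.array([5, 6, 1]))  # same result from either backend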