CrossEntropy.h
//===========================================================================
/*!
 *
 *
 * \brief Error measure for classification tasks that can be used
 * as the objective function for training.
 *
 *
 *
 *
 * \author -
 * \date -
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */

#ifndef SHARK_OBJECTIVEFUNCTIONS_LOSS_CROSS_ENTROPY_H
#define SHARK_OBJECTIVEFUNCTIONS_LOSS_CROSS_ENTROPY_H

#include <shark/ObjectiveFunctions/Loss/AbstractLoss.h>

namespace shark{

/*!
 * \brief Error measure for classification tasks that can be used
 * as the objective function for training.
 *
 * If your model returns a vector whose components reflect the
 * logarithmic conditional probabilities of class membership given the input vector,
 * 'CrossEntropy' is the adequate error measure for model training.
 * For \em C > 1 classes the loss function is defined as
 * \f[
 * E = - \ln \frac{\exp(x_c)}{\sum_{c^{\prime}=1}^C \exp(x_{c^{\prime}})} = - x_c + \ln \sum_{c^{\prime}=1}^C \exp(x_{c^{\prime}})
 * \f]
 * where \em x is the prediction vector of the model and \em c is the class label. In the case of only one
 * model output and binary classification, another, numerically more stable formulation is used:
 * \f[
 * E = \ln(1 + e^{-yx})
 * \f]
 * Here \em y is the class label mapped to -1 or 1 via \f$ y = 2c - 1 \f$. This formulation is numerically
 * more stable because when \f$ e^{-yx} \f$ is large, the loss is well approximated by the linear function
 * \f$ -yx \f$, and when the exponential is very small, the case \f$ \ln(0) \f$ is avoided.
 *
 * The class labels must be integers starting from 0. Also, for theoretical reasons, the output neurons of the
 * neural network must be linear.
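 *
 * \par Example
 * A minimal usage sketch; the labels, dimensions and values below are placeholders:
 * \code
 * CrossEntropy loss;
 * UIntVector targets(2);           // one class label per sample
 * targets(0) = 0; targets(1) = 2;
 * RealMatrix predictions(2, 3);    // one row of (linear) model outputs per sample
 * // ... fill predictions with the model outputs ...
 * double error = loss.eval(targets, predictions);
 *
 * RealMatrix gradient;             // receives dE/dx, one row per sample
 * double sameError = loss.evalDerivative(targets, predictions, gradient);
 * \endcode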
 */
class CrossEntropy : public AbstractLoss<unsigned int,RealVector>
{
private:
	typedef AbstractLoss<unsigned int,RealVector> base_type;

	//uses a different formula to compute the binary case for 1 output.
	//should be numerically more stable
	//formula: ln(1+exp(-yx)) with y = -1/1
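	//e.g. for y = +1 and x = -300 the intermediate exp(-yx) = exp(300) is roughly 1.9e130,
	//while ln(1+exp(300)) equals 300 to machine precision, so the linear branch below
	//returns -yx directly and never forms the huge intermediate value.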
	double evalError(double label,double exponential,double value) const {

		if(value*label < -200 ){
			//below this, we might get numeric instabilities,
			//but we know that ln(1+exp(x)) converges to x for big arguments
			return - value * label;
		}
		return std::log(1+exponential);
	}
public:
	CrossEntropy()
	{
		m_features |= HAS_FIRST_DERIVATIVE;
		//~ m_features |= HAS_SECOND_DERIVATIVE;
	}


	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "CrossEntropy"; }

	// annoyingness of C++ templates: bring the base-class eval overloads back into scope
	using base_type::eval;

	double eval(UIntVector const& target, RealMatrix const& prediction) const {
		if ( prediction.size2() == 1 )
		{
			double error = 0;
			for(std::size_t i = 0; i != prediction.size1(); ++i){
				RANGE_CHECK ( target(i) < 2 );
				double label = 2 * static_cast<double>(target(i)) - 1; //converts labels from 0/1 to -1/1
				double exponential = std::exp ( -label * prediction ( i,0 ) );
				error += evalError(label,exponential,prediction (i, 0 ));
			}
			return error;
		}
		else
		{
			double error = 0;
			for(std::size_t i = 0; i != prediction.size1(); ++i){
				RANGE_CHECK ( target(i) < prediction.size2() );

				//calculate the log norm in a numerically stable way:
				//we subtract the maximum prior to exponentiation to
				//ensure that the exponentiation result still fits into a double
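				//(log-sum-exp identity: ln sum_j exp(x_j) = m + ln sum_j exp(x_j - m) for any m)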
				double maximum = max(row(prediction,i));
				double logNorm = 0;
				for( std::size_t j = 0; j != prediction.size2(); ++j)
				{
					double term = std::exp(prediction(i,j)-maximum);
					logNorm += term;
				}
				logNorm = std::log(logNorm)+maximum;

				error += logNorm - prediction(i,target(i));
			}
			return error;
		}
	}

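	// For the multi-class branch the gradient of E = ln sum_j exp(x_j) - x_c with respect to x_j
	// is softmax(x)_j - delta_{j,c}: each gradient row below is first set to the softmax of the
	// corresponding prediction row and then 1 is subtracted at the target index. For the
	// single-output branch, dE/dx = -y*(1 - sigmoid(y*x)).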
	double evalDerivative(UIntVector const& target, RealMatrix const& prediction, RealMatrix& gradient) const {
		gradient.resize(prediction.size1(),prediction.size2());
		if ( prediction.size2() == 1 )
		{
			double error = 0;
			for(std::size_t i = 0; i != prediction.size1(); ++i){
				RANGE_CHECK ( target(i) < 2 );
				double label = 2 * static_cast<double>(target(i)) - 1; //converts labels from 0/1 to -1/1
				double exponential = std::exp ( -label * prediction (i, 0 ) );
				double sigmoid = 1.0/(1.0+exponential);
				gradient ( i,0 ) = -label * (1.0 - sigmoid);
				error += evalError(label,exponential,prediction (i, 0 ));
			}
			return error;
		}
		else
		{
			double error = 0;
			for(std::size_t i = 0; i != prediction.size1(); ++i){
				RANGE_CHECK ( target(i) < prediction.size2() );
				RealMatrixRow gradRow=row(gradient,i);

				//calculate the log norm in a numerically stable way:
				//we subtract the maximum prior to exponentiation to
				//ensure that the exponentiation result still fits into a double.
				//this does not change the result as the values get normalized by
				//their sum and thus the correction term cancels out.
				double maximum = max(row(prediction,i));
				noalias(gradRow) = exp(row(prediction,i) - maximum);
				double norm = sum(gradRow);
				gradRow /= norm;
				gradient(i,target(i)) -= 1;
				error += std::log(norm) - prediction(i,target(i)) + maximum;
			}
			return error;
		}
	}

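	// Second-order variant for a single sample. With p = softmax(x), the Hessian of the
	// multi-class loss is diag(p) - p p^T; in the single-output case the second derivative
	// is sigmoid(y*x)*(1 - sigmoid(y*x)).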
	double evalDerivative (
		unsigned int const& target,
		RealVector const& prediction,
		RealVector& gradient,
		RealMatrix& hessian
	) const{
		gradient.resize(prediction.size());
		hessian.resize(prediction.size(),prediction.size());
		if ( prediction.size() == 1 )
		{
			RANGE_CHECK ( target < 2 );
			double label = 2 * static_cast<double>(target) - 1; //converts labels from 0/1 to -1/1
			double exponential = std::exp ( -label * prediction ( 0 ) );
			double sigmoid = 1.0/(1.0+exponential);
			gradient ( 0 ) = -label * (1.0-sigmoid);
			hessian ( 0,0 ) = sigmoid * ( 1-sigmoid );
			return evalError(label,exponential,prediction ( 0 ));
		}
		else
		{
			RANGE_CHECK ( target < prediction.size() );
			//calculate the log norm in a numerically stable way:
			//we subtract the maximum prior to exponentiation to
			//ensure that the exponentiation result still fits into a double.
			//this does not change the result as the values get normalized by
			//their sum and thus the correction term cancels out.
			double maximum = max(prediction);
			noalias(gradient) = exp(prediction-maximum);
			double norm = sum(gradient);
			gradient /= norm;

			noalias(hessian) = -outer_prod(gradient,gradient);
			noalias(diag(hessian)) += gradient;
			gradient(target) -= 1;

			return std::log(norm) - prediction(target) + maximum;
		}
	}
};


}
#endif