NegativeGaussianProcessEvidence.h
//===========================================================================
/*!
 *
 *
 * \brief       Evidence for model selection of a regularization network/Gaussian process.
 *
 *
 *
 * \author      C. Igel, T. Glasmachers, O. Krause
 * \date        2007-2012
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_OBJECTIVEFUNCTIONS_NEGATIVEGAUSSIANPROCESSEVIDENCE_H
#define SHARK_OBJECTIVEFUNCTIONS_NEGATIVEGAUSSIANPROCESSEVIDENCE_H

#include <shark/Data/Dataset.h>
#include <shark/ObjectiveFunctions/AbstractObjectiveFunction.h>
#include <shark/Models/Kernels/KernelHelpers.h>

#include <shark/LinAlg/Base.h>
#include <shark/LinAlg/Cholesky.h>

namespace shark {


///
/// \brief Evidence for model selection of a regularization network/Gaussian process.
///
/// Let \f$M\f$ denote the (kernel Gram) covariance matrix and
/// \f$t\f$ the corresponding label vector. For the evidence we have:
/// \f[ E = 1/2 \cdot [ -\log(\det(M)) - t^T M^{-1} t - N \log(2 \pi) ] \f]
///
/// The evidence is also known as the marginal (log)likelihood. For
/// details, please see:
///
/// C.E. Rasmussen & C.K.I. Williams, Gaussian
/// Processes for Machine Learning, section 5.4, MIT Press, 2006
///
/// C.M. Bishop, Pattern Recognition and Machine Learning, section
/// 6.4.3, Springer, 2006
///
/// The regularization parameter can be encoded in different ways.
/// The exponential encoding is the proper choice for unconstrained optimization.
/// Be careful not to mix up different encodings between trainer and evidence.
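///
/// A minimal usage sketch (hedged: GaussianRbfKernel and IRpropPlus are only
/// illustrative choices; any differentiable Shark kernel and gradient-based
/// single-objective optimizer can be substituted):
/// \code
/// RegressionDataset data = ...;                     // labeled training data
/// GaussianRbfKernel<> kernel(0.1);
/// NegativeGaussianProcessEvidence<> evidence(data, &kernel, true); // exponential encoding
///
/// // start from the current kernel parameters and a small noise variance
/// RealVector start(evidence.numberOfVariables());
/// blas::init(start) << kernel.parameterVector(), std::log(1.e-4);
///
/// IRpropPlus rprop;
/// rprop.init(evidence, start);
/// for (std::size_t i = 0; i != 100; ++i) rprop.step(evidence);
/// kernel.setParameterVector(subrange(rprop.solution().point, 0, kernel.numberOfParameters()));
/// \endcode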
template<class InputType = RealVector, class OutputType = RealVector, class LabelType = RealVector>
class NegativeGaussianProcessEvidence : public SingleObjectiveFunction
{
public:
    typedef LabeledData<InputType, LabelType> DatasetType;
    typedef AbstractKernelFunction<InputType> KernelType;

    /// \param dataset: training data for the Gaussian process
    /// \param kernel: pointer to external kernel function
    /// \param unconstrained: exponential encoding of the regularization parameter for unconstrained optimization
    NegativeGaussianProcessEvidence(
        DatasetType const& dataset,
        KernelType* kernel,
        bool unconstrained = false
    ): m_dataset(dataset)
    , mep_kernel(kernel)
    , m_unconstrained(unconstrained)
    {
        setThreshold(0.);
    }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "NegativeGaussianProcessEvidence"; }

    std::size_t numberOfVariables() const{
        return 1 + mep_kernel->numberOfParameters();
    }

    /// Let \f$M\f$ denote the (kernel Gram) covariance matrix and
    /// \f$t\f$ the label vector. For the evidence we have:
    /// \f[ E = 1/2 \cdot [ -\log(\det(M)) - t^T M^{-1} t - N \log(2 \pi) ] \f]
    double eval(const RealVector& parameters) const {
        std::size_t N = m_dataset.numberOfElements();
        std::size_t kp = mep_kernel->numberOfParameters();
        // check whether the argument has the right dimensionality
        SHARK_ASSERT(1 + kp == parameters.size());

        // keep track of how often the objective function is called
        m_evaluationCounter++;

        // set the parameters
        RealVector kernelParams(kp);
        double betaInv = 0;
        blas::init(parameters) >> kernelParams, betaInv;
        if(m_unconstrained)
            betaInv = std::exp(betaInv); // for unconstrained optimization
        mep_kernel->setParameterVector(kernelParams);

        // generate the kernel matrix and the label vector
        RealMatrix M = calculateRegularizedKernelMatrix(*mep_kernel, m_dataset.inputs(), betaInv);
        RealVector t = column(createBatch<RealVector>(m_dataset.labels().elements()), 0);

        RealMatrix choleskyFactor(N, N);
        choleskyDecomposition(M, choleskyFactor);

        // compute the determinant of M using the Cholesky factorization M = AA^T:
        // ln det(M) = 2 trace(ln A)
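        // (this works because det(M) = det(A)det(A^T) = (prod_i A_ii)^2 for the triangular
        //  factor A, hence ln det(M) = 2 sum_i ln A_ii = 2 trace(log(A)); the log acts
        //  elementwise and the trace only reads the diagonal)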
        double logDet = 2 * trace(log(choleskyFactor));

        // we need to compute t^T M^-1 t
        // = t^T (AA^T)^-1 t = t^T A^-T A^-1 t = ||A^-1 t||^2,
        // so we first solve the triangular system Az = t
        // and then compute ||z||^2.
        // Since we don't need t anymore after that, we solve in place and omit z.
        blas::solveTriangularSystemInPlace<blas::SolveAXB,blas::lower>(choleskyFactor, t);

        // equation (6.69) on page 311 of C.M. Bishop, Pattern Recognition and Machine Learning, Springer, 2006:
        // e = 1/2 [ -log(det(M)) - t^T M^{-1} t - N log(2 pi) ]
        double e = 0.5 * (-logDet - norm_sqr(t) - N * std::log(2.0 * M_PI));

        // return the *negative* evidence
        return -e;
    }

    /// Let \f$M\f$ denote the regularized (kernel Gram) covariance matrix.
    /// For the evidence we have:
    /// \f[ E = 1/2 \cdot [ -\log(\det(M)) - t^T M^{-1} t - N \log(2 \pi) ] \f]
    /// For a kernel parameter \f$p\f$ and \f$C = \beta^{-1}\f$ we get the derivatives:
    /// \f[ dE/dC = 1/2 \cdot [ -tr(M^{-1}) + \| M^{-1} t \|^2 ] \f]
    /// \f[ dE/dp = 1/2 \cdot [ -tr(M^{-1} dM/dp) + t^T (M^{-1} dM/dp M^{-1}) t ] \f]
    double evalDerivative(const RealVector& parameters, FirstOrderDerivative& derivative) const {
        std::size_t N = m_dataset.numberOfElements();
        std::size_t kp = mep_kernel->numberOfParameters();

        // check whether the argument has the right dimensionality
        SHARK_ASSERT(1 + kp == parameters.size());
        derivative.resize(1 + kp);

        // keep track of how often the objective function is called
        m_evaluationCounter++;

        // set the parameters
        RealVector kernelParams(kp);
        double betaInv = 0;
        blas::init(parameters) >> kernelParams, betaInv;
        if(m_unconstrained)
            betaInv = std::exp(betaInv); // for unconstrained optimization
        mep_kernel->setParameterVector(kernelParams);


        // generate the kernel matrix and the label vector
        RealMatrix M = calculateRegularizedKernelMatrix(*mep_kernel, m_dataset.inputs(), betaInv);
        RealVector t = column(createBatch<RealVector>(m_dataset.labels().elements()), 0);

        // compute the inverse and the log-determinant via the Cholesky factorization
        RealMatrix choleskyFactor(N, N);
        choleskyDecomposition(M, choleskyFactor);
        // we don't need M anymore, so free its memory
        M = RealMatrix();

        // compute the derivative w.r.t. the kernel parameters.
        // The derivative is defined as:
        // dE/da = -tr(IM dM/da) + t^T IM dM/da IM t
        // where IM is the inverse of M, tr is the trace and a are the kernel parameters.
        // By substituting z = IM t we can expand the operations to:
        // dE/da = -(sum_i sum_j IM_ij * dM_ji/da) + (sum_i sum_j dM_ij/da * z_i * z_j)
        //       = sum_i sum_j (-IM_ij + z_i * z_j) * dM_ij/da
        // With W = -IM + zz^T we get
        // dE/da = sum_i sum_j W_ij dM_ij/da,
        // which can be calculated as a blockwise derivative.

        // compute the inverse matrix from the Cholesky decomposition
        // using forward-backward substitution
        RealMatrix W = RealIdentityMatrix(N);
        blas::solveTriangularCholeskyInPlace<blas::SolveAXB>(choleskyFactor, W);

        // calculate z = Wt = M^-1 t
        RealVector z = prod(W, t);

        // W currently holds the inverse of M, so to obtain W = -IM + zz^T we only
        // need to flip the sign and add the outer product zz^T
        W *= -1;
        W += outer_prod(z, z);


        // now calculate the kernel-parameter part of the derivative
        RealVector kernelGradient = 0.5 * calculateKernelMatrixParameterDerivative(*mep_kernel, m_dataset.inputs(), W);

        // compute the derivative w.r.t. the regularization parameter.
        // We have: dE/dC = 1/2 * [ -tr(M^{-1}) + ||M^{-1} t||^2 ],
        // which, since tr(zz^T) = ||z||^2, can also be written as 1/2 tr(W)
        double betaInvDerivative = 0.5 * trace(W);
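        // with the exponential (unconstrained) encoding the optimizer works on p = log(C),
        // so by the chain rule dE/dp = dE/dC * dC/dp = dE/dC * C, i.e. multiply by betaInv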
        if(m_unconstrained)
            betaInvDerivative *= betaInv;

        // merge both derivatives; since we return the negative evidence, multiply by -1
        blas::init(derivative) << kernelGradient, betaInvDerivative;
        derivative *= -1.0;

        // truncate the gradient vector
        for(std::size_t i = 0; i < derivative.size(); i++)
            if(std::abs(derivative(i)) < m_derivativeThresholds(i)) derivative(i) = 0;

        // compute the evidence
        // (determinant of M via the Cholesky factor, see eval() for why this works)
        double logDetM = 2 * trace(log(choleskyFactor));
        double e = 0.5 * (-logDetM - inner_prod(t, z) - N * std::log(2.0 * M_PI));
        return -e;
    }

    /// set the threshold value for truncating partial derivatives
    void setThreshold(double d) {
        m_derivativeThresholds = RealVector(mep_kernel->numberOfParameters() + 1, d); // plus one for the regularization parameter
    }

    /// set individual threshold values for truncating partial derivatives
    void setThresholds(RealVector &c) {
        SHARK_ASSERT(m_derivativeThresholds.size() == c.size());
        m_derivativeThresholds = c;
    }


private:
    /// the training dataset
    DatasetType m_dataset;

    /// thresholds for setting derivatives to zero
    RealVector m_derivativeThresholds;

    /// pointer to the external kernel function
    KernelType* mep_kernel;

    /// Indicates whether the regularization parameter is encoded as its
    /// logarithm (exponential encoding). This is useful for unconstrained
    /// optimization. The default value is false.
    bool m_unconstrained;
};


}
#endif